mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-13 12:10:18 +00:00
Merge branch 'master' into compilade/imatrix-batched-chunks
This commit is contained in:
commit
3ad0603c65
@ -1,18 +1,16 @@
|
|||||||
ARG UBUNTU_VERSION=22.04
|
ARG UBUNTU_VERSION=22.04
|
||||||
|
|
||||||
# This needs to generally match the container host's environment.
|
# This needs to generally match the container host's environment.
|
||||||
ARG CUDA_VERSION=11.7.1
|
ARG CUDA_VERSION=12.6.0
|
||||||
|
|
||||||
# Target the CUDA build image
|
# Target the CUDA build image
|
||||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||||
|
|
||||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||||
|
|
||||||
# Unless otherwise specified, we make a fat build.
|
# CUDA architecture to build for (defaults to all supported archs)
|
||||||
ARG CUDA_DOCKER_ARCH=all
|
ARG CUDA_DOCKER_ARCH=default
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
|
apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
|
||||||
|
|
||||||
COPY requirements.txt requirements.txt
|
COPY requirements.txt requirements.txt
|
||||||
COPY requirements requirements
|
COPY requirements requirements
|
||||||
@ -24,13 +22,12 @@ WORKDIR /app
|
|||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
# Set nvcc architecture
|
# Use the default CUDA archs if not specified
|
||||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
|
||||||
# Enable CUDA
|
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
|
||||||
ENV GGML_CUDA=1
|
fi && \
|
||||||
# Enable cURL
|
cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||||
ENV LLAMA_CURL=1
|
cmake --build build --config Release -j$(nproc) && \
|
||||||
|
cp build/bin/* .
|
||||||
RUN make -j$(nproc)
|
|
||||||
|
|
||||||
ENTRYPOINT ["/app/.devops/tools.sh"]
|
ENTRYPOINT ["/app/.devops/tools.sh"]
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
ARG UBUNTU_VERSION=22.04
|
ARG UBUNTU_VERSION=22.04
|
||||||
# This needs to generally match the container host's environment.
|
# This needs to generally match the container host's environment.
|
||||||
ARG CUDA_VERSION=11.7.1
|
ARG CUDA_VERSION=12.6.0
|
||||||
# Target the CUDA build image
|
# Target the CUDA build image
|
||||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||||
# Target the CUDA runtime image
|
# Target the CUDA runtime image
|
||||||
@ -8,28 +8,30 @@ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_V
|
|||||||
|
|
||||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||||
|
|
||||||
# Unless otherwise specified, we make a fat build.
|
# CUDA architecture to build for (defaults to all supported archs)
|
||||||
ARG CUDA_DOCKER_ARCH=all
|
ARG CUDA_DOCKER_ARCH=default
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y build-essential git
|
apt-get install -y build-essential git cmake
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
# Set nvcc architecture
|
# Use the default CUDA archs if not specified
|
||||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
|
||||||
# Enable CUDA
|
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
|
||||||
ENV GGML_CUDA=1
|
fi && \
|
||||||
|
cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||||
RUN make -j$(nproc) llama-cli
|
cmake --build build --config Release --target llama-cli -j$(nproc)
|
||||||
|
|
||||||
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
|
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y libgomp1
|
apt-get install -y libgomp1
|
||||||
|
|
||||||
COPY --from=build /app/llama-cli /llama-cli
|
COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
|
||||||
|
COPY --from=build /app/build/src/libllama.so /libllama.so
|
||||||
|
COPY --from=build /app/build/bin/llama-cli /llama-cli
|
||||||
|
|
||||||
ENTRYPOINT [ "/llama-cli" ]
|
ENTRYPOINT [ "/llama-cli" ]
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
ARG UBUNTU_VERSION=22.04
|
ARG UBUNTU_VERSION=22.04
|
||||||
# This needs to generally match the container host's environment.
|
# This needs to generally match the container host's environment.
|
||||||
ARG CUDA_VERSION=11.7.1
|
ARG CUDA_VERSION=12.6.0
|
||||||
# Target the CUDA build image
|
# Target the CUDA build image
|
||||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||||
# Target the CUDA runtime image
|
# Target the CUDA runtime image
|
||||||
@ -8,31 +8,34 @@ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_V
|
|||||||
|
|
||||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||||
|
|
||||||
# Unless otherwise specified, we make a fat build.
|
# CUDA architecture to build for (defaults to all supported archs)
|
||||||
ARG CUDA_DOCKER_ARCH=all
|
ARG CUDA_DOCKER_ARCH=default
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y build-essential git libcurl4-openssl-dev
|
apt-get install -y build-essential git cmake libcurl4-openssl-dev
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
# Set nvcc architecture
|
# Use the default CUDA archs if not specified
|
||||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
|
||||||
# Enable CUDA
|
export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
|
||||||
ENV GGML_CUDA=1
|
fi && \
|
||||||
# Enable cURL
|
cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
|
||||||
ENV LLAMA_CURL=1
|
cmake --build build --config Release --target llama-server -j$(nproc)
|
||||||
|
|
||||||
RUN make -j$(nproc) llama-server
|
|
||||||
|
|
||||||
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
|
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y libcurl4-openssl-dev libgomp1 curl
|
apt-get install -y libcurl4-openssl-dev libgomp1 curl
|
||||||
|
|
||||||
COPY --from=build /app/llama-server /llama-server
|
COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
|
||||||
|
COPY --from=build /app/build/src/libllama.so /libllama.so
|
||||||
|
COPY --from=build /app/build/bin/llama-server /llama-server
|
||||||
|
|
||||||
|
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
||||||
|
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||||
|
|
||||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||||
|
|
||||||
|
@ -26,6 +26,8 @@ RUN apt-get update && \
|
|||||||
COPY --from=build /app/build/bin/llama-server /llama-server
|
COPY --from=build /app/build/bin/llama-server /llama-server
|
||||||
|
|
||||||
ENV LC_ALL=C.utf8
|
ENV LC_ALL=C.utf8
|
||||||
|
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
||||||
|
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||||
|
|
||||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||||
|
|
||||||
|
@ -39,6 +39,8 @@ ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
|
|||||||
ENV GGML_HIPBLAS=1
|
ENV GGML_HIPBLAS=1
|
||||||
ENV CC=/opt/rocm/llvm/bin/clang
|
ENV CC=/opt/rocm/llvm/bin/clang
|
||||||
ENV CXX=/opt/rocm/llvm/bin/clang++
|
ENV CXX=/opt/rocm/llvm/bin/clang++
|
||||||
|
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
||||||
|
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||||
|
|
||||||
# Enable cURL
|
# Enable cURL
|
||||||
ENV LLAMA_CURL=1
|
ENV LLAMA_CURL=1
|
||||||
|
@ -23,6 +23,8 @@ RUN cp /app/build/bin/llama-server /llama-server && \
|
|||||||
rm -rf /app
|
rm -rf /app
|
||||||
|
|
||||||
ENV LC_ALL=C.utf8
|
ENV LC_ALL=C.utf8
|
||||||
|
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
||||||
|
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||||
|
|
||||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||||
|
|
||||||
|
@ -21,6 +21,8 @@ RUN apt-get update && \
|
|||||||
COPY --from=build /app/llama-server /llama-server
|
COPY --from=build /app/llama-server /llama-server
|
||||||
|
|
||||||
ENV LC_ALL=C.utf8
|
ENV LC_ALL=C.utf8
|
||||||
|
# Must be set to 0.0.0.0 so it can listen to requests from host machine
|
||||||
|
ENV LLAMA_ARG_HOST=0.0.0.0
|
||||||
|
|
||||||
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||||
|
|
||||||
|
@ -1,13 +1,52 @@
|
|||||||
|
{ inputs, ... }:
|
||||||
|
|
||||||
{
|
{
|
||||||
perSystem =
|
perSystem =
|
||||||
{ config, lib, ... }:
|
{
|
||||||
|
config,
|
||||||
|
lib,
|
||||||
|
system,
|
||||||
|
...
|
||||||
|
}:
|
||||||
{
|
{
|
||||||
devShells =
|
devShells =
|
||||||
lib.concatMapAttrs
|
let
|
||||||
(name: package: {
|
pkgs = import inputs.nixpkgs { inherit system; };
|
||||||
${name} = package.passthru.shell;
|
stdenv = pkgs.stdenv;
|
||||||
${name + "-extra"} = package.passthru.shell-extra;
|
scripts = config.packages.python-scripts;
|
||||||
})
|
in
|
||||||
config.packages;
|
lib.pipe (config.packages) [
|
||||||
|
(lib.concatMapAttrs (
|
||||||
|
name: package: {
|
||||||
|
${name} = pkgs.mkShell {
|
||||||
|
name = "${name}";
|
||||||
|
inputsFrom = [ package ];
|
||||||
|
shellHook = ''
|
||||||
|
echo "Entering ${name} devShell"
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
"${name}-extra" =
|
||||||
|
if (name == "python-scripts") then
|
||||||
|
null
|
||||||
|
else
|
||||||
|
pkgs.mkShell {
|
||||||
|
name = "${name}-extra";
|
||||||
|
inputsFrom = [
|
||||||
|
package
|
||||||
|
scripts
|
||||||
|
];
|
||||||
|
# Extra packages that *may* be used by some scripts
|
||||||
|
packages = [
|
||||||
|
pkgs.python3Packages.tiktoken
|
||||||
|
];
|
||||||
|
shellHook = ''
|
||||||
|
echo "Entering ${name} devShell"
|
||||||
|
addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib stdenv.cc.cc}/lib"
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
}
|
||||||
|
))
|
||||||
|
(lib.filterAttrs (name: value: value != null))
|
||||||
|
];
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -26,16 +26,14 @@
|
|||||||
config.cudaSupport = true;
|
config.cudaSupport = true;
|
||||||
config.allowUnfreePredicate =
|
config.allowUnfreePredicate =
|
||||||
p:
|
p:
|
||||||
builtins.all
|
builtins.all (
|
||||||
(
|
license:
|
||||||
license:
|
license.free
|
||||||
license.free
|
|| builtins.elem license.shortName [
|
||||||
|| builtins.elem license.shortName [
|
"CUDA EULA"
|
||||||
"CUDA EULA"
|
"cuDNN EULA"
|
||||||
"cuDNN EULA"
|
]
|
||||||
]
|
) (p.meta.licenses or [ p.meta.license ]);
|
||||||
)
|
|
||||||
(p.meta.licenses or [ p.meta.license ]);
|
|
||||||
};
|
};
|
||||||
# Ensure dependencies use ROCm consistently
|
# Ensure dependencies use ROCm consistently
|
||||||
pkgsRocm = import inputs.nixpkgs {
|
pkgsRocm = import inputs.nixpkgs {
|
||||||
|
36
.devops/nix/package-gguf-py.nix
Normal file
36
.devops/nix/package-gguf-py.nix
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
{
|
||||||
|
lib,
|
||||||
|
llamaVersion,
|
||||||
|
numpy,
|
||||||
|
tqdm,
|
||||||
|
sentencepiece,
|
||||||
|
pyyaml,
|
||||||
|
poetry-core,
|
||||||
|
buildPythonPackage,
|
||||||
|
pytestCheckHook,
|
||||||
|
}:
|
||||||
|
|
||||||
|
buildPythonPackage {
|
||||||
|
pname = "gguf";
|
||||||
|
version = llamaVersion;
|
||||||
|
pyproject = true;
|
||||||
|
nativeBuildInputs = [ poetry-core ];
|
||||||
|
propagatedBuildInputs = [
|
||||||
|
numpy
|
||||||
|
tqdm
|
||||||
|
sentencepiece
|
||||||
|
pyyaml
|
||||||
|
];
|
||||||
|
src = lib.cleanSource ../../gguf-py;
|
||||||
|
pythonImportsCheck = [
|
||||||
|
"numpy"
|
||||||
|
"gguf"
|
||||||
|
];
|
||||||
|
nativeCheckInputs = [ pytestCheckHook ];
|
||||||
|
doCheck = true;
|
||||||
|
meta = with lib; {
|
||||||
|
description = "Python package for writing binary files in the GGUF format";
|
||||||
|
license = licenses.mit;
|
||||||
|
maintainers = [ maintainers.ditsuke ];
|
||||||
|
};
|
||||||
|
}
|
@ -3,31 +3,33 @@
|
|||||||
glibc,
|
glibc,
|
||||||
config,
|
config,
|
||||||
stdenv,
|
stdenv,
|
||||||
mkShell,
|
|
||||||
runCommand,
|
runCommand,
|
||||||
cmake,
|
cmake,
|
||||||
ninja,
|
ninja,
|
||||||
pkg-config,
|
pkg-config,
|
||||||
git,
|
git,
|
||||||
python3,
|
|
||||||
mpi,
|
mpi,
|
||||||
blas,
|
blas,
|
||||||
cudaPackages,
|
cudaPackages,
|
||||||
|
autoAddDriverRunpath,
|
||||||
darwin,
|
darwin,
|
||||||
rocmPackages,
|
rocmPackages,
|
||||||
vulkan-headers,
|
vulkan-headers,
|
||||||
vulkan-loader,
|
vulkan-loader,
|
||||||
curl,
|
curl,
|
||||||
shaderc,
|
shaderc,
|
||||||
useBlas ? builtins.all (x: !x) [
|
useBlas ?
|
||||||
useCuda
|
builtins.all (x: !x) [
|
||||||
useMetalKit
|
useCuda
|
||||||
useRocm
|
useMetalKit
|
||||||
useVulkan
|
useRocm
|
||||||
] && blas.meta.available,
|
useVulkan
|
||||||
|
]
|
||||||
|
&& blas.meta.available,
|
||||||
useCuda ? config.cudaSupport,
|
useCuda ? config.cudaSupport,
|
||||||
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
|
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
|
||||||
useMpi ? false, # Increases the runtime closure size by ~700M
|
# Increases the runtime closure size by ~700M
|
||||||
|
useMpi ? false,
|
||||||
useRocm ? config.rocmSupport,
|
useRocm ? config.rocmSupport,
|
||||||
enableCurl ? true,
|
enableCurl ? true,
|
||||||
useVulkan ? false,
|
useVulkan ? false,
|
||||||
@ -37,8 +39,8 @@
|
|||||||
# otherwise we get libstdc++ errors downstream.
|
# otherwise we get libstdc++ errors downstream.
|
||||||
effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
|
effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
|
||||||
enableStatic ? effectiveStdenv.hostPlatform.isStatic,
|
enableStatic ? effectiveStdenv.hostPlatform.isStatic,
|
||||||
precompileMetalShaders ? false
|
precompileMetalShaders ? false,
|
||||||
}@inputs:
|
}:
|
||||||
|
|
||||||
let
|
let
|
||||||
inherit (lib)
|
inherit (lib)
|
||||||
@ -46,7 +48,6 @@ let
|
|||||||
cmakeFeature
|
cmakeFeature
|
||||||
optionals
|
optionals
|
||||||
strings
|
strings
|
||||||
versionOlder
|
|
||||||
;
|
;
|
||||||
|
|
||||||
stdenv = throw "Use effectiveStdenv instead";
|
stdenv = throw "Use effectiveStdenv instead";
|
||||||
@ -62,54 +63,11 @@ let
|
|||||||
pnameSuffix =
|
pnameSuffix =
|
||||||
strings.optionalString (suffices != [ ])
|
strings.optionalString (suffices != [ ])
|
||||||
"-${strings.concatMapStringsSep "-" strings.toLower suffices}";
|
"-${strings.concatMapStringsSep "-" strings.toLower suffices}";
|
||||||
descriptionSuffix =
|
descriptionSuffix = strings.optionalString (
|
||||||
strings.optionalString (suffices != [ ])
|
suffices != [ ]
|
||||||
", accelerated with ${strings.concatStringsSep ", " suffices}";
|
) ", accelerated with ${strings.concatStringsSep ", " suffices}";
|
||||||
|
|
||||||
executableSuffix = effectiveStdenv.hostPlatform.extensions.executable;
|
xcrunHost = runCommand "xcrunHost" { } ''
|
||||||
|
|
||||||
# TODO: package the Python in this repository in a Nix-like way.
|
|
||||||
# It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
|
|
||||||
# is PEP 517-compatible, and ensure the correct .dist-info is generated.
|
|
||||||
# https://peps.python.org/pep-0517/
|
|
||||||
#
|
|
||||||
# TODO: Package up each Python script or service appropriately, by making
|
|
||||||
# them into "entrypoints"
|
|
||||||
llama-python = python3.withPackages (
|
|
||||||
ps: [
|
|
||||||
ps.numpy
|
|
||||||
ps.sentencepiece
|
|
||||||
]
|
|
||||||
);
|
|
||||||
|
|
||||||
# TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
|
|
||||||
llama-python-extra = python3.withPackages (
|
|
||||||
ps: [
|
|
||||||
ps.numpy
|
|
||||||
ps.sentencepiece
|
|
||||||
ps.tiktoken
|
|
||||||
ps.torchWithoutCuda
|
|
||||||
ps.transformers
|
|
||||||
|
|
||||||
# server bench
|
|
||||||
ps.matplotlib
|
|
||||||
|
|
||||||
# server tests
|
|
||||||
ps.openai
|
|
||||||
ps.behave
|
|
||||||
ps.prometheus-client
|
|
||||||
|
|
||||||
# for examples/pydantic-models-to-grammar-examples.py
|
|
||||||
ps.docstring-parser
|
|
||||||
ps.pydantic
|
|
||||||
|
|
||||||
# for scripts/compare-llama-bench.py
|
|
||||||
ps.gitpython
|
|
||||||
ps.tabulate
|
|
||||||
]
|
|
||||||
);
|
|
||||||
|
|
||||||
xcrunHost = runCommand "xcrunHost" {} ''
|
|
||||||
mkdir -p $out/bin
|
mkdir -p $out/bin
|
||||||
ln -s /usr/bin/xcrun $out/bin
|
ln -s /usr/bin/xcrun $out/bin
|
||||||
'';
|
'';
|
||||||
@ -144,181 +102,145 @@ let
|
|||||||
];
|
];
|
||||||
in
|
in
|
||||||
|
|
||||||
effectiveStdenv.mkDerivation (
|
effectiveStdenv.mkDerivation (finalAttrs: {
|
||||||
finalAttrs: {
|
pname = "llama-cpp${pnameSuffix}";
|
||||||
pname = "llama-cpp${pnameSuffix}";
|
version = llamaVersion;
|
||||||
version = llamaVersion;
|
|
||||||
|
|
||||||
# Note: none of the files discarded here are visible in the sandbox or
|
# Note: none of the files discarded here are visible in the sandbox or
|
||||||
# affect the output hash. This also means they can be modified without
|
# affect the output hash. This also means they can be modified without
|
||||||
# triggering a rebuild.
|
# triggering a rebuild.
|
||||||
src = lib.cleanSourceWith {
|
src = lib.cleanSourceWith {
|
||||||
filter =
|
filter =
|
||||||
name: type:
|
name: type:
|
||||||
let
|
let
|
||||||
noneOf = builtins.all (x: !x);
|
noneOf = builtins.all (x: !x);
|
||||||
baseName = baseNameOf name;
|
baseName = baseNameOf name;
|
||||||
in
|
in
|
||||||
noneOf [
|
noneOf [
|
||||||
(lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
|
(lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
|
||||||
(lib.hasSuffix ".md" name) # Ignore *.md changes when computing outPaths
|
(lib.hasSuffix ".md" name) # Ignore *.md changes when computing outPaths
|
||||||
(lib.hasPrefix "." baseName) # Skip hidden files and directories
|
(lib.hasPrefix "." baseName) # Skip hidden files and directories
|
||||||
(baseName == "flake.lock")
|
(baseName == "flake.lock")
|
||||||
];
|
|
||||||
src = lib.cleanSource ../../.;
|
|
||||||
};
|
|
||||||
|
|
||||||
postPatch = ''
|
|
||||||
substituteInPlace ./ggml/src/ggml-metal.m \
|
|
||||||
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
|
|
||||||
substituteInPlace ./ggml/src/ggml-metal.m \
|
|
||||||
--replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
|
|
||||||
'';
|
|
||||||
|
|
||||||
# With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
|
|
||||||
# `default.metallib` may be compiled with Metal compiler from XCode
|
|
||||||
# and we need to escape sandbox on MacOS to access Metal compiler.
|
|
||||||
# `xcrun` is used to find the path of the Metal compiler, which is variable
|
|
||||||
# and not on $PATH
|
|
||||||
# see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
|
|
||||||
__noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
|
|
||||||
|
|
||||||
nativeBuildInputs =
|
|
||||||
[
|
|
||||||
cmake
|
|
||||||
ninja
|
|
||||||
pkg-config
|
|
||||||
git
|
|
||||||
]
|
|
||||||
++ optionals useCuda [
|
|
||||||
cudaPackages.cuda_nvcc
|
|
||||||
|
|
||||||
# TODO: Replace with autoAddDriverRunpath
|
|
||||||
# once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
|
|
||||||
cudaPackages.autoAddOpenGLRunpathHook
|
|
||||||
]
|
|
||||||
++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
|
|
||||||
glibc.static
|
|
||||||
] ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [
|
|
||||||
xcrunHost
|
|
||||||
];
|
];
|
||||||
|
src = lib.cleanSource ../../.;
|
||||||
|
};
|
||||||
|
|
||||||
buildInputs =
|
postPatch = ''
|
||||||
optionals effectiveStdenv.isDarwin darwinBuildInputs
|
substituteInPlace ./ggml/src/ggml-metal.m \
|
||||||
++ optionals useCuda cudaBuildInputs
|
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
|
||||||
++ optionals useMpi [ mpi ]
|
substituteInPlace ./ggml/src/ggml-metal.m \
|
||||||
++ optionals useRocm rocmBuildInputs
|
--replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
|
||||||
++ optionals useBlas [ blas ]
|
'';
|
||||||
++ optionals useVulkan vulkanBuildInputs
|
|
||||||
++ optionals enableCurl [ curl ];
|
|
||||||
|
|
||||||
cmakeFlags =
|
# With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
|
||||||
[
|
# `default.metallib` may be compiled with Metal compiler from XCode
|
||||||
(cmakeBool "LLAMA_BUILD_SERVER" true)
|
# and we need to escape sandbox on MacOS to access Metal compiler.
|
||||||
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
|
# `xcrun` is used to find the path of the Metal compiler, which is variable
|
||||||
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
|
# and not on $PATH
|
||||||
(cmakeBool "LLAMA_CURL" enableCurl)
|
# see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
|
||||||
(cmakeBool "GGML_NATIVE" false)
|
__noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
|
||||||
(cmakeBool "GGML_BLAS" useBlas)
|
|
||||||
(cmakeBool "GGML_CUDA" useCuda)
|
nativeBuildInputs =
|
||||||
(cmakeBool "GGML_HIPBLAS" useRocm)
|
[
|
||||||
(cmakeBool "GGML_METAL" useMetalKit)
|
cmake
|
||||||
(cmakeBool "GGML_VULKAN" useVulkan)
|
ninja
|
||||||
(cmakeBool "GGML_STATIC" enableStatic)
|
pkg-config
|
||||||
]
|
git
|
||||||
++ optionals useCuda [
|
]
|
||||||
(
|
++ optionals useCuda [
|
||||||
with cudaPackages.flags;
|
cudaPackages.cuda_nvcc
|
||||||
cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
|
|
||||||
builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
|
autoAddDriverRunpath
|
||||||
)
|
]
|
||||||
|
++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ glibc.static ]
|
||||||
|
++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ xcrunHost ];
|
||||||
|
|
||||||
|
buildInputs =
|
||||||
|
optionals effectiveStdenv.isDarwin darwinBuildInputs
|
||||||
|
++ optionals useCuda cudaBuildInputs
|
||||||
|
++ optionals useMpi [ mpi ]
|
||||||
|
++ optionals useRocm rocmBuildInputs
|
||||||
|
++ optionals useBlas [ blas ]
|
||||||
|
++ optionals useVulkan vulkanBuildInputs
|
||||||
|
++ optionals enableCurl [ curl ];
|
||||||
|
|
||||||
|
cmakeFlags =
|
||||||
|
[
|
||||||
|
(cmakeBool "LLAMA_BUILD_SERVER" true)
|
||||||
|
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
|
||||||
|
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
|
||||||
|
(cmakeBool "LLAMA_CURL" enableCurl)
|
||||||
|
(cmakeBool "GGML_NATIVE" false)
|
||||||
|
(cmakeBool "GGML_BLAS" useBlas)
|
||||||
|
(cmakeBool "GGML_CUDA" useCuda)
|
||||||
|
(cmakeBool "GGML_HIPBLAS" useRocm)
|
||||||
|
(cmakeBool "GGML_METAL" useMetalKit)
|
||||||
|
(cmakeBool "GGML_VULKAN" useVulkan)
|
||||||
|
(cmakeBool "GGML_STATIC" enableStatic)
|
||||||
|
]
|
||||||
|
++ optionals useCuda [
|
||||||
|
(
|
||||||
|
with cudaPackages.flags;
|
||||||
|
cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
|
||||||
|
builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
|
||||||
)
|
)
|
||||||
]
|
)
|
||||||
++ optionals useRocm [
|
]
|
||||||
(cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
|
++ optionals useRocm [
|
||||||
(cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
|
(cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
|
||||||
]
|
(cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
|
||||||
++ optionals useMetalKit [
|
]
|
||||||
(lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
|
++ optionals useMetalKit [
|
||||||
(cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
|
(lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
|
||||||
];
|
(cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
|
||||||
|
];
|
||||||
|
|
||||||
# Environment variables needed for ROCm
|
# Environment variables needed for ROCm
|
||||||
env = optionals useRocm {
|
env = optionals useRocm {
|
||||||
ROCM_PATH = "${rocmPackages.clr}";
|
ROCM_PATH = "${rocmPackages.clr}";
|
||||||
HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
|
HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
|
||||||
};
|
};
|
||||||
|
|
||||||
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
|
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
|
||||||
# if they haven't been added yet.
|
# if they haven't been added yet.
|
||||||
postInstall = ''
|
postInstall = ''
|
||||||
mkdir -p $out/include
|
mkdir -p $out/include
|
||||||
cp $src/include/llama.h $out/include/
|
cp $src/include/llama.h $out/include/
|
||||||
'';
|
'';
|
||||||
|
|
||||||
# Define the shells here, but don't add in the inputsFrom to avoid recursion.
|
meta = {
|
||||||
passthru = {
|
# Configurations we don't want even the CI to evaluate. Results in the
|
||||||
inherit
|
# "unsupported platform" messages. This is mostly a no-op, because
|
||||||
useBlas
|
# cudaPackages would've refused to evaluate anyway.
|
||||||
useCuda
|
badPlatforms = optionals useCuda lib.platforms.darwin;
|
||||||
useMetalKit
|
|
||||||
useMpi
|
|
||||||
useRocm
|
|
||||||
useVulkan
|
|
||||||
;
|
|
||||||
|
|
||||||
shell = mkShell {
|
# Configurations that are known to result in build failures. Can be
|
||||||
name = "shell-${finalAttrs.finalPackage.name}";
|
# overridden by importing Nixpkgs with `allowBroken = true`.
|
||||||
description = "contains numpy and sentencepiece";
|
broken = (useMetalKit && !effectiveStdenv.isDarwin);
|
||||||
buildInputs = [ llama-python ];
|
|
||||||
inputsFrom = [ finalAttrs.finalPackage ];
|
|
||||||
shellHook = ''
|
|
||||||
addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
|
|
||||||
'';
|
|
||||||
};
|
|
||||||
|
|
||||||
shell-extra = mkShell {
|
description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
|
||||||
name = "shell-extra-${finalAttrs.finalPackage.name}";
|
homepage = "https://github.com/ggerganov/llama.cpp/";
|
||||||
description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
|
license = lib.licenses.mit;
|
||||||
buildInputs = [ llama-python-extra ];
|
|
||||||
inputsFrom = [ finalAttrs.finalPackage ];
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
meta = {
|
# Accommodates `nix run` and `lib.getExe`
|
||||||
# Configurations we don't want even the CI to evaluate. Results in the
|
mainProgram = "llama-cli";
|
||||||
# "unsupported platform" messages. This is mostly a no-op, because
|
|
||||||
# cudaPackages would've refused to evaluate anyway.
|
|
||||||
badPlatforms = optionals useCuda lib.platforms.darwin;
|
|
||||||
|
|
||||||
# Configurations that are known to result in build failures. Can be
|
# These people might respond, on the best effort basis, if you ping them
|
||||||
# overridden by importing Nixpkgs with `allowBroken = true`.
|
# in case of Nix-specific regressions or for reviewing Nix-specific PRs.
|
||||||
broken = (useMetalKit && !effectiveStdenv.isDarwin);
|
# Consider adding yourself to this list if you want to ensure this flake
|
||||||
|
# stays maintained and you're willing to invest your time. Do not add
|
||||||
|
# other people without their consent. Consider removing people after
|
||||||
|
# they've been unreachable for long periods of time.
|
||||||
|
|
||||||
description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
|
# Note that lib.maintainers is defined in Nixpkgs, but you may just add
|
||||||
homepage = "https://github.com/ggerganov/llama.cpp/";
|
# an attrset following the same format as in
|
||||||
license = lib.licenses.mit;
|
# https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
|
||||||
|
maintainers = with lib.maintainers; [
|
||||||
|
philiptaron
|
||||||
|
SomeoneSerge
|
||||||
|
];
|
||||||
|
|
||||||
# Accommodates `nix run` and `lib.getExe`
|
# Extend `badPlatforms` instead
|
||||||
mainProgram = "llama-cli";
|
platforms = lib.platforms.all;
|
||||||
|
};
|
||||||
# These people might respond, on the best effort basis, if you ping them
|
})
|
||||||
# in case of Nix-specific regressions or for reviewing Nix-specific PRs.
|
|
||||||
# Consider adding yourself to this list if you want to ensure this flake
|
|
||||||
# stays maintained and you're willing to invest your time. Do not add
|
|
||||||
# other people without their consent. Consider removing people after
|
|
||||||
# they've been unreachable for long periods of time.
|
|
||||||
|
|
||||||
# Note that lib.maintainers is defined in Nixpkgs, but you may just add
|
|
||||||
# an attrset following the same format as in
|
|
||||||
# https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
|
|
||||||
maintainers = with lib.maintainers; [
|
|
||||||
philiptaron
|
|
||||||
SomeoneSerge
|
|
||||||
];
|
|
||||||
|
|
||||||
# Extend `badPlatforms` instead
|
|
||||||
platforms = lib.platforms.all;
|
|
||||||
};
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
66
.devops/nix/python-scripts.nix
Normal file
66
.devops/nix/python-scripts.nix
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
{
|
||||||
|
lib,
|
||||||
|
stdenv,
|
||||||
|
buildPythonPackage,
|
||||||
|
poetry-core,
|
||||||
|
mkShell,
|
||||||
|
python3Packages,
|
||||||
|
gguf-py,
|
||||||
|
}@inputs:
|
||||||
|
|
||||||
|
let
|
||||||
|
llama-python-deps = with python3Packages; [
|
||||||
|
numpy
|
||||||
|
sentencepiece
|
||||||
|
transformers
|
||||||
|
protobuf
|
||||||
|
torchWithoutCuda
|
||||||
|
gguf-py
|
||||||
|
tqdm
|
||||||
|
|
||||||
|
# for scripts/compare-llama-bench.py
|
||||||
|
gitpython
|
||||||
|
tabulate
|
||||||
|
|
||||||
|
# for examples/pydantic-models-to-grammar-examples.py
|
||||||
|
docstring-parser
|
||||||
|
pydantic
|
||||||
|
|
||||||
|
];
|
||||||
|
|
||||||
|
llama-python-test-deps = with python3Packages; [
|
||||||
|
# Server bench
|
||||||
|
matplotlib
|
||||||
|
|
||||||
|
# server tests
|
||||||
|
openai
|
||||||
|
behave
|
||||||
|
prometheus-client
|
||||||
|
];
|
||||||
|
in
|
||||||
|
|
||||||
|
buildPythonPackage ({
|
||||||
|
pname = "llama-scripts";
|
||||||
|
version = "0.0.0";
|
||||||
|
pyproject = true;
|
||||||
|
|
||||||
|
# NOTE: The files filtered out here are not visible in the build sandbox, neither
|
||||||
|
# do they affect the output hash. They can be modified without triggering a rebuild.
|
||||||
|
src = lib.cleanSourceWith {
|
||||||
|
filter =
|
||||||
|
name: type:
|
||||||
|
let
|
||||||
|
any = builtins.any (x: x);
|
||||||
|
baseName = builtins.baseNameOf name;
|
||||||
|
in
|
||||||
|
any [
|
||||||
|
(lib.hasSuffix ".py" name)
|
||||||
|
(baseName == "README.md")
|
||||||
|
(baseName == "pyproject.toml")
|
||||||
|
];
|
||||||
|
src = lib.cleanSource ../../.;
|
||||||
|
};
|
||||||
|
nativeBuildInputs = [ poetry-core ];
|
||||||
|
nativeCheckInputs = llama-python-test-deps;
|
||||||
|
dependencies = llama-python-deps;
|
||||||
|
})
|
@ -1,19 +1,41 @@
|
|||||||
{
|
{
|
||||||
lib,
|
lib,
|
||||||
newScope,
|
newScope,
|
||||||
|
python3,
|
||||||
llamaVersion ? "0.0.0",
|
llamaVersion ? "0.0.0",
|
||||||
}:
|
}:
|
||||||
|
|
||||||
|
let
|
||||||
|
pythonPackages = python3.pkgs;
|
||||||
|
buildPythonPackage = pythonPackages.buildPythonPackage;
|
||||||
|
numpy = pythonPackages.numpy;
|
||||||
|
tqdm = pythonPackages.tqdm;
|
||||||
|
sentencepiece = pythonPackages.sentencepiece;
|
||||||
|
pyyaml = pythonPackages.pyyaml;
|
||||||
|
poetry-core = pythonPackages.poetry-core;
|
||||||
|
pytestCheckHook = pythonPackages.pytestCheckHook;
|
||||||
|
in
|
||||||
|
|
||||||
# We're using `makeScope` instead of just writing out an attrset
|
# We're using `makeScope` instead of just writing out an attrset
|
||||||
# because it allows users to apply overlays later using `overrideScope'`.
|
# because it allows users to apply overlays later using `overrideScope'`.
|
||||||
# Cf. https://noogle.dev/f/lib/makeScope
|
# Cf. https://noogle.dev/f/lib/makeScope
|
||||||
|
|
||||||
lib.makeScope newScope (
|
lib.makeScope newScope (self: {
|
||||||
self: {
|
inherit llamaVersion;
|
||||||
inherit llamaVersion;
|
gguf-py = self.callPackage ./package-gguf-py.nix {
|
||||||
llama-cpp = self.callPackage ./package.nix { };
|
inherit
|
||||||
docker = self.callPackage ./docker.nix { };
|
buildPythonPackage
|
||||||
docker-min = self.callPackage ./docker.nix { interactive = false; };
|
numpy
|
||||||
sif = self.callPackage ./sif.nix { };
|
tqdm
|
||||||
}
|
sentencepiece
|
||||||
)
|
poetry-core
|
||||||
|
pyyaml
|
||||||
|
pytestCheckHook
|
||||||
|
;
|
||||||
|
};
|
||||||
|
python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
|
||||||
|
llama-cpp = self.callPackage ./package.nix { };
|
||||||
|
docker = self.callPackage ./docker.nix { };
|
||||||
|
docker-min = self.callPackage ./docker.nix { interactive = false; };
|
||||||
|
sif = self.callPackage ./sif.nix { };
|
||||||
|
})
|
||||||
|
2
.ecrc
2
.ecrc
@ -1,5 +1,5 @@
|
|||||||
{
|
{
|
||||||
"Exclude": ["^\\.gitmodules$"],
|
"Exclude": ["^\\.gitmodules$", "stb_image\\.h"],
|
||||||
"Disable": {
|
"Disable": {
|
||||||
"IndentSize": true
|
"IndentSize": true
|
||||||
}
|
}
|
||||||
|
2
.github/workflows/build.yml
vendored
2
.github/workflows/build.yml
vendored
@ -857,7 +857,7 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
|
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
|
||||||
cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
|
cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
|
||||||
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||||
|
|
||||||
|
21
.github/workflows/docker.yml
vendored
21
.github/workflows/docker.yml
vendored
@ -37,9 +37,9 @@ jobs:
|
|||||||
- { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
|
- { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||||
- { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
|
- { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||||
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
|
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||||
- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
|
||||||
- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
#- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||||
# Note: the full-rocm image is failing due to a "no space left on device" error. It is disabled for now to allow the workflow to complete.
|
#- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||||
#- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
#- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||||
- { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
|
- { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
|
||||||
- { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
|
- { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
|
||||||
@ -96,21 +96,12 @@ jobs:
|
|||||||
env:
|
env:
|
||||||
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
|
GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
|
||||||
|
|
||||||
- name: Build and push Docker image (versioned)
|
- name: Build and push Docker image (tagged + versioned)
|
||||||
if: github.event_name == 'push'
|
if: github.event_name == 'push'
|
||||||
uses: docker/build-push-action@v4
|
uses: docker/build-push-action@v6
|
||||||
with:
|
with:
|
||||||
context: .
|
context: .
|
||||||
push: true
|
push: true
|
||||||
platforms: ${{ matrix.config.platforms }}
|
platforms: ${{ matrix.config.platforms }}
|
||||||
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
|
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
|
||||||
file: ${{ matrix.config.dockerfile }}
|
|
||||||
|
|
||||||
- name: Build and push Docker image (tagged)
|
|
||||||
uses: docker/build-push-action@v4
|
|
||||||
with:
|
|
||||||
context: .
|
|
||||||
push: ${{ github.event_name == 'push' }}
|
|
||||||
platforms: ${{ matrix.config.platforms }}
|
|
||||||
tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
|
|
||||||
file: ${{ matrix.config.dockerfile }}
|
file: ${{ matrix.config.dockerfile }}
|
||||||
|
1
.gitignore
vendored
1
.gitignore
vendored
@ -61,6 +61,7 @@ llama-batched-swift
|
|||||||
/rpc-server
|
/rpc-server
|
||||||
out/
|
out/
|
||||||
tmp/
|
tmp/
|
||||||
|
autogen-*.md
|
||||||
|
|
||||||
# Deprecated
|
# Deprecated
|
||||||
|
|
||||||
|
@ -28,11 +28,12 @@
|
|||||||
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
|
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
|
||||||
{ "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
|
{ "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
|
||||||
{ "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
|
{ "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
|
||||||
|
{ "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
|
||||||
|
|
||||||
{
|
{
|
||||||
"name": "arm64-windows-msvc", "hidden": true,
|
"name": "arm64-windows-msvc", "hidden": true,
|
||||||
"architecture": { "value": "arm64", "strategy": "external" },
|
"architecture": { "value": "arm64", "strategy": "external" },
|
||||||
"toolset": { "value": "host=x86_64", "strategy": "external" },
|
"toolset": { "value": "host=x64", "strategy": "external" },
|
||||||
"cacheVariables": {
|
"cacheVariables": {
|
||||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
|
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
|
||||||
}
|
}
|
||||||
@ -40,8 +41,8 @@
|
|||||||
|
|
||||||
{
|
{
|
||||||
"name": "arm64-windows-llvm", "hidden": true,
|
"name": "arm64-windows-llvm", "hidden": true,
|
||||||
"architecture": { "value": "arm64", "strategy": "external" },
|
"architecture": { "value": "arm64", "strategy": "external" },
|
||||||
"toolset": { "value": "host=x86_64", "strategy": "external" },
|
"toolset": { "value": "host=x64", "strategy": "external" },
|
||||||
"cacheVariables": {
|
"cacheVariables": {
|
||||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
|
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
|
||||||
}
|
}
|
||||||
@ -60,6 +61,8 @@
|
|||||||
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
|
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
|
||||||
|
|
||||||
{ "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
|
{ "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
|
||||||
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
|
{ "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
|
||||||
|
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
|
||||||
|
{ "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] }
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
19
Makefile
19
Makefile
@ -39,10 +39,12 @@ BUILD_TARGETS = \
|
|||||||
llama-tokenize \
|
llama-tokenize \
|
||||||
llama-vdot \
|
llama-vdot \
|
||||||
llama-cvector-generator \
|
llama-cvector-generator \
|
||||||
|
llama-gen-docs \
|
||||||
tests/test-c.o
|
tests/test-c.o
|
||||||
|
|
||||||
# Binaries only useful for tests
|
# Binaries only useful for tests
|
||||||
TEST_TARGETS = \
|
TEST_TARGETS = \
|
||||||
|
tests/test-arg-parser \
|
||||||
tests/test-autorelease \
|
tests/test-autorelease \
|
||||||
tests/test-backend-ops \
|
tests/test-backend-ops \
|
||||||
tests/test-chat-template \
|
tests/test-chat-template \
|
||||||
@ -927,7 +929,6 @@ OBJ_COMMON = \
|
|||||||
common/ngram-cache.o \
|
common/ngram-cache.o \
|
||||||
common/sampling.o \
|
common/sampling.o \
|
||||||
common/train.o \
|
common/train.o \
|
||||||
common/grammar-parser.o \
|
|
||||||
common/build-info.o \
|
common/build-info.o \
|
||||||
common/json-schema-to-grammar.o
|
common/json-schema-to-grammar.o
|
||||||
|
|
||||||
@ -1167,11 +1168,6 @@ common/console.o: \
|
|||||||
common/console.h
|
common/console.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
common/grammar-parser.o: \
|
|
||||||
common/grammar-parser.cpp \
|
|
||||||
common/grammar-parser.h
|
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
|
||||||
|
|
||||||
common/json-schema-to-grammar.o: \
|
common/json-schema-to-grammar.o: \
|
||||||
common/json-schema-to-grammar.cpp \
|
common/json-schema-to-grammar.cpp \
|
||||||
common/json-schema-to-grammar.h
|
common/json-schema-to-grammar.h
|
||||||
@ -1448,6 +1444,12 @@ examples/server/%.hpp: examples/server/public/% Makefile
|
|||||||
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
|
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
|
||||||
) > $@
|
) > $@
|
||||||
|
|
||||||
|
# llama-gen-docs: compile examples/gen-docs/gen-docs.cpp to its object file, then link that
# object with the remaining prerequisites ($(OBJ_ALL), minus any %.h files and the source itself).
# NOTE: the last recipe line executes ./llama-gen-docs, so building this target also runs the tool.
llama-gen-docs: examples/gen-docs/gen-docs.cpp \
|
||||||
|
$(OBJ_ALL)
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
./llama-gen-docs
|
||||||
|
|
||||||
libllava.a: examples/llava/llava.cpp \
|
libllava.a: examples/llava/llava.cpp \
|
||||||
examples/llava/llava.h \
|
examples/llava/llava.h \
|
||||||
examples/llava/clip.cpp \
|
examples/llava/clip.cpp \
|
||||||
@ -1505,6 +1507,11 @@ run-benchmark-matmult: llama-benchmark-matmult
|
|||||||
|
|
||||||
.PHONY: run-benchmark-matmult swift
|
.PHONY: run-benchmark-matmult swift
|
||||||
|
|
||||||
|
# tests/test-arg-parser: compile tests/test-arg-parser.cpp to its object file, then link that
# object with the remaining prerequisites ($(OBJ_ALL), minus any %.h files and the source itself).
tests/test-arg-parser: tests/test-arg-parser.cpp \
|
||||||
|
$(OBJ_ALL)
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
tests/test-llama-grammar: tests/test-llama-grammar.cpp \
|
tests/test-llama-grammar: tests/test-llama-grammar.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
|
25
README.md
25
README.md
@ -10,32 +10,14 @@
|
|||||||
|
|
||||||
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
|
Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
|
||||||
|
|
||||||
> [!IMPORTANT]
|
|
||||||
[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
|
|
||||||
|
|
||||||
## Recent API changes
|
## Recent API changes
|
||||||
|
|
||||||
- [2024 Jun 26] The source code and CMake build scripts have been restructured https://github.com/ggerganov/llama.cpp/pull/8006
|
- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289)
|
||||||
- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
|
- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291)
|
||||||
- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
|
|
||||||
- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
|
|
||||||
- [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
|
|
||||||
- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
|
|
||||||
- [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
|
|
||||||
- [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849
|
|
||||||
|
|
||||||
## Hot topics
|
## Hot topics
|
||||||
|
|
||||||
- **`convert.py` has been deprecated and moved to `examples/convert_legacy_llama.py`, please use `convert_hf_to_gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
|
- *add hot topics here*
|
||||||
- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
|
|
||||||
- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
|
|
||||||
- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
|
|
||||||
- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
|
|
||||||
- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
|
|
||||||
- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017
|
|
||||||
- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
|
|
||||||
- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
|
|
||||||
- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
|
|
||||||
|
|
||||||
----
|
----
|
||||||
|
|
||||||
@ -106,6 +88,7 @@ Typically finetunes of the base models below are supported as well.
|
|||||||
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
|
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
|
||||||
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
|
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
|
||||||
- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
|
- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
|
||||||
|
- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
|
||||||
|
|
||||||
(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
|
(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
|
||||||
|
|
||||||
|
29
ci/run.sh
29
ci/run.sh
@ -13,6 +13,9 @@
|
|||||||
# # with SYCL support
|
# # with SYCL support
|
||||||
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||||
#
|
#
|
||||||
|
# # with VULKAN support
|
||||||
|
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||||
|
#
|
||||||
|
|
||||||
if [ -z "$2" ]; then
|
if [ -z "$2" ]; then
|
||||||
echo "usage: $0 <output-dir> <mnt-dir>"
|
echo "usage: $0 <output-dir> <mnt-dir>"
|
||||||
@ -40,7 +43,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
||||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=1"
|
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ! -z ${GG_BUILD_SYCL} ]; then
|
if [ ! -z ${GG_BUILD_SYCL} ]; then
|
||||||
@ -52,6 +55,10 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
|
|||||||
|
|
||||||
# SYCL build flags. Every CMake cache definition needs its own -D prefix: without it,
# "DCMAKE_C_COMPILER=icx" is passed to cmake as a stray path argument and the C compiler
# is never switched to icx.
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
|
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# When GG_BUILD_VULKAN is set to a non-empty value, append -DGGML_VULKAN=1 to the
# extra CMake arguments used by the cmake invocations in this script.
if [ ! -z ${GG_BUILD_VULKAN} ]; then
|
||||||
|
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
|
||||||
|
fi
|
||||||
## helpers
|
## helpers
|
||||||
|
|
||||||
# download a file if it does not exist or if it is outdated
|
# download a file if it does not exist or if it is outdated
|
||||||
@ -107,7 +114,7 @@ function gg_run_ctest_debug {
|
|||||||
gg_check_build_requirements
|
gg_check_build_requirements
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||||
|
|
||||||
@ -138,7 +145,7 @@ function gg_run_ctest_release {
|
|||||||
gg_check_build_requirements
|
gg_check_build_requirements
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||||
(time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
(time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||||
@ -266,7 +273,6 @@ function gg_sum_ctest_with_model_release {
|
|||||||
}
|
}
|
||||||
|
|
||||||
# open_llama_7b_v2
|
# open_llama_7b_v2
|
||||||
# requires: GG_BUILD_CUDA
|
|
||||||
|
|
||||||
function gg_run_open_llama_7b_v2 {
|
function gg_run_open_llama_7b_v2 {
|
||||||
cd ${SRC}
|
cd ${SRC}
|
||||||
@ -290,8 +296,8 @@ function gg_run_open_llama_7b_v2 {
|
|||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
@ -425,7 +431,7 @@ function gg_run_pythia_1_4b {
|
|||||||
set -e
|
set -e
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
@ -535,7 +541,6 @@ function gg_sum_pythia_1_4b {
|
|||||||
}
|
}
|
||||||
|
|
||||||
# pythia_2_8b
|
# pythia_2_8b
|
||||||
# requires: GG_BUILD_CUDA
|
|
||||||
|
|
||||||
function gg_run_pythia_2_8b {
|
function gg_run_pythia_2_8b {
|
||||||
cd ${SRC}
|
cd ${SRC}
|
||||||
@ -556,8 +561,8 @@ function gg_run_pythia_2_8b {
|
|||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
@ -692,7 +697,7 @@ function gg_run_embd_bge_small {
|
|||||||
set -e
|
set -e
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
@ -761,7 +766,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
|
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
|
||||||
if [ -z ${GG_BUILD_CUDA} ]; then
|
if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then
|
||||||
test $ret -eq 0 && gg_run pythia_1_4b
|
test $ret -eq 0 && gg_run pythia_1_4b
|
||||||
else
|
else
|
||||||
test $ret -eq 0 && gg_run pythia_2_8b
|
test $ret -eq 0 && gg_run pythia_2_8b
|
||||||
|
@ -58,8 +58,6 @@ add_library(${TARGET} STATIC
|
|||||||
sampling.cpp
|
sampling.cpp
|
||||||
console.h
|
console.h
|
||||||
console.cpp
|
console.cpp
|
||||||
grammar-parser.h
|
|
||||||
grammar-parser.cpp
|
|
||||||
json.hpp
|
json.hpp
|
||||||
json-schema-to-grammar.cpp
|
json-schema-to-grammar.cpp
|
||||||
train.h
|
train.h
|
||||||
|
3442
common/common.cpp
3442
common/common.cpp
File diff suppressed because it is too large
Load Diff
153
common/common.h
153
common/common.h
@ -14,8 +14,10 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
#include <random>
|
#include <random>
|
||||||
#include <thread>
|
#include <thread>
|
||||||
|
#include <set>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
#include <tuple>
|
#include <tuple>
|
||||||
|
#include <functional>
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
#define DIRECTORY_SEPARATOR '\\'
|
#define DIRECTORY_SEPARATOR '\\'
|
||||||
@ -61,19 +63,43 @@ int32_t cpu_get_num_math();
|
|||||||
// CLI argument parsing
|
// CLI argument parsing
|
||||||
//
|
//
|
||||||
|
|
||||||
|
// Tags used by `llama_arg::examples` and `gpt_params::curr_ex` in this header.
// Presumably each enumerator names one example program that shares the common
// argument parser -- TODO confirm against common.cpp.
// No explicit values are given, so enumerators number consecutively from 0.
enum llama_example {
|
||||||
|
LLAMA_EXAMPLE_COMMON,
|
||||||
|
LLAMA_EXAMPLE_SPECULATIVE,
|
||||||
|
LLAMA_EXAMPLE_MAIN,
|
||||||
|
LLAMA_EXAMPLE_INFILL,
|
||||||
|
LLAMA_EXAMPLE_EMBEDDING,
|
||||||
|
LLAMA_EXAMPLE_PERPLEXITY,
|
||||||
|
LLAMA_EXAMPLE_RETRIEVAL,
|
||||||
|
LLAMA_EXAMPLE_PASSKEY,
|
||||||
|
LLAMA_EXAMPLE_IMATRIX,
|
||||||
|
LLAMA_EXAMPLE_BENCH,
|
||||||
|
LLAMA_EXAMPLE_SERVER,
|
||||||
|
LLAMA_EXAMPLE_CVECTOR_GENERATOR,
|
||||||
|
LLAMA_EXAMPLE_EXPORT_LORA,
|
||||||
|
LLAMA_EXAMPLE_LLAVA,
|
||||||
|
|
||||||
|
// declared last, so its value equals the number of enumerators above it
LLAMA_EXAMPLE_COUNT,
|
||||||
|
};
|
||||||
|
|
||||||
// dimensionality reduction methods, used by cvector-generator
|
// dimensionality reduction methods, used by cvector-generator
|
||||||
enum dimre_method {
|
enum dimre_method {
|
||||||
DIMRE_METHOD_PCA,
|
DIMRE_METHOD_PCA,
|
||||||
DIMRE_METHOD_MEAN,
|
DIMRE_METHOD_MEAN,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct gpt_params {
|
struct cpu_params {
|
||||||
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
|
int n_threads = -1;
|
||||||
|
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
|
||||||
|
bool mask_valid = false; // Default: any CPU
|
||||||
|
enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
|
||||||
|
bool strict_cpu = false; // Use strict CPU placement
|
||||||
|
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
|
||||||
|
};
|
||||||
|
|
||||||
|
struct gpt_params {
|
||||||
|
enum llama_example curr_ex = LLAMA_EXAMPLE_COMMON;
|
||||||
|
|
||||||
int32_t n_threads = cpu_get_num_math();
|
|
||||||
int32_t n_threads_draft = -1;
|
|
||||||
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
|
||||||
int32_t n_threads_batch_draft = -1;
|
|
||||||
int32_t n_predict = -1; // new tokens to predict
|
int32_t n_predict = -1; // new tokens to predict
|
||||||
int32_t n_ctx = 0; // context size
|
int32_t n_ctx = 0; // context size
|
||||||
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
||||||
@ -100,6 +126,11 @@ struct gpt_params {
|
|||||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||||
float defrag_thold = -1.0f; // KV cache defragmentation threshold
|
float defrag_thold = -1.0f; // KV cache defragmentation threshold
|
||||||
|
|
||||||
|
struct cpu_params cpuparams;
|
||||||
|
struct cpu_params cpuparams_batch;
|
||||||
|
struct cpu_params draft_cpuparams;
|
||||||
|
struct cpu_params draft_cpuparams_batch;
|
||||||
|
|
||||||
ggml_backend_sched_eval_callback cb_eval = nullptr;
|
ggml_backend_sched_eval_callback cb_eval = nullptr;
|
||||||
void * cb_eval_user_data = nullptr;
|
void * cb_eval_user_data = nullptr;
|
||||||
|
|
||||||
@ -110,8 +141,7 @@ struct gpt_params {
|
|||||||
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
|
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
|
||||||
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
|
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
|
||||||
|
|
||||||
// // sampling parameters
|
struct gpt_sampler_params sparams;
|
||||||
struct llama_sampling_params sparams;
|
|
||||||
|
|
||||||
std::string model = ""; // model path
|
std::string model = ""; // model path
|
||||||
std::string model_draft = ""; // draft model for speculative decoding
|
std::string model_draft = ""; // draft model for speculative decoding
|
||||||
@ -159,6 +189,7 @@ struct gpt_params {
|
|||||||
|
|
||||||
bool kl_divergence = false; // compute KL divergence
|
bool kl_divergence = false; // compute KL divergence
|
||||||
|
|
||||||
|
std::function<void(int, char **)> print_usage = nullptr; // print example-specific usage and example
|
||||||
bool usage = false; // print usage
|
bool usage = false; // print usage
|
||||||
bool use_color = false; // use color to distinguish generations and inputs
|
bool use_color = false; // use color to distinguish generations and inputs
|
||||||
bool special = false; // enable special token output
|
bool special = false; // enable special token output
|
||||||
@ -175,7 +206,6 @@ struct gpt_params {
|
|||||||
bool flash_attn = false; // flash attention
|
bool flash_attn = false; // flash attention
|
||||||
|
|
||||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||||
bool ignore_eos = false; // ignore generated EOS tokens
|
|
||||||
bool logits_all = false; // return logits for all tokens in the batch
|
bool logits_all = false; // return logits for all tokens in the batch
|
||||||
bool use_mmap = true; // use mmap for faster loads
|
bool use_mmap = true; // use mmap for faster loads
|
||||||
bool use_mlock = false; // use mlock to keep model in memory
|
bool use_mlock = false; // use mlock to keep model in memory
|
||||||
@ -204,7 +234,7 @@ struct gpt_params {
|
|||||||
int32_t port = 8080; // server listens on this network port
|
int32_t port = 8080; // server listens on this network port
|
||||||
int32_t timeout_read = 600; // http read timeout in seconds
|
int32_t timeout_read = 600; // http read timeout in seconds
|
||||||
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
||||||
int32_t n_threads_http = -1; // number of threads to process HTTP requests
|
int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||||
|
|
||||||
std::string hostname = "127.0.0.1";
|
std::string hostname = "127.0.0.1";
|
||||||
std::string public_path = "";
|
std::string public_path = "";
|
||||||
@ -265,18 +295,104 @@ struct gpt_params {
|
|||||||
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
||||||
|
|
||||||
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
|
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
|
||||||
|
|
||||||
|
// batched-bench params
|
||||||
|
bool batched_bench_output_jsonl = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
void gpt_params_handle_hf_token(gpt_params & params);
|
struct llama_arg {
|
||||||
void gpt_params_handle_model_default(gpt_params & params);
|
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
|
||||||
|
std::vector<const char *> args;
|
||||||
|
const char * value_hint = nullptr; // help text or example for arg value
|
||||||
|
const char * value_hint_2 = nullptr; // for second arg value
|
||||||
|
const char * env = nullptr;
|
||||||
|
std::string help;
|
||||||
|
void (*handler_void) (gpt_params & params) = nullptr;
|
||||||
|
void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
|
||||||
|
void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
|
||||||
|
void (*handler_int) (gpt_params & params, int) = nullptr;
|
||||||
|
|
||||||
bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
|
llama_arg(
|
||||||
bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
|
const std::initializer_list<const char *> & args,
|
||||||
bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
|
const char * value_hint,
|
||||||
void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
|
const std::string & help,
|
||||||
|
void (*handler)(gpt_params & params, const std::string &)
|
||||||
|
) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
|
||||||
|
|
||||||
|
llama_arg(
|
||||||
|
const std::initializer_list<const char *> & args,
|
||||||
|
const char * value_hint,
|
||||||
|
const std::string & help,
|
||||||
|
void (*handler)(gpt_params & params, int)
|
||||||
|
) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
|
||||||
|
|
||||||
|
llama_arg(
|
||||||
|
const std::initializer_list<const char *> & args,
|
||||||
|
const std::string & help,
|
||||||
|
void (*handler)(gpt_params & params)
|
||||||
|
) : args(args), help(help), handler_void(handler) {}
|
||||||
|
|
||||||
|
// support 2 values for arg
|
||||||
|
llama_arg(
|
||||||
|
const std::initializer_list<const char *> & args,
|
||||||
|
const char * value_hint,
|
||||||
|
const char * value_hint_2,
|
||||||
|
const std::string & help,
|
||||||
|
void (*handler)(gpt_params & params, const std::string &, const std::string &)
|
||||||
|
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
|
||||||
|
|
||||||
|
// Replaces the set of examples this argument applies to.
// Returns *this so calls can be chained.
llama_arg & set_examples(std::initializer_list<enum llama_example> examples) {
    // initializer_list elements are const, so the assignment always copies;
    // wrapping the list in std::move had no effect
    this->examples = examples;
    return *this;
}
|
||||||
|
|
||||||
|
// Binds this argument to an environment variable and appends the variable
// name to the help text. `env` must be a non-null C string.
// Returns *this so calls can be chained.
llama_arg & set_env(const char * env) {
    help.append("\n(env: ").append(env).append(")");
    this->env = env;
    return *this;
}
|
||||||
|
|
||||||
|
bool in_example(enum llama_example ex) {
|
||||||
|
return examples.find(ex) != examples.end();
|
||||||
|
}
|
||||||
|
|
||||||
|
bool get_value_from_env(std::string & output) const {
|
||||||
|
if (env == nullptr) return false;
|
||||||
|
char * value = std::getenv(env);
|
||||||
|
if (value) {
|
||||||
|
output = value;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool has_value_from_env() const {
|
||||||
|
return env != nullptr && std::getenv(env);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string to_string();
|
||||||
|
};
|
||||||
|
|
||||||
|
// initialize list of options (arguments) that can be used by the current example
|
||||||
|
std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex);
|
||||||
|
// optionally, we can provide "print_usage" to print example usage
|
||||||
|
std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex, std::function<void(int, char **)> print_usage);
|
||||||
|
|
||||||
|
// parse input arguments from CLI
|
||||||
|
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
|
||||||
|
bool gpt_params_parse (int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
|
||||||
|
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
|
||||||
|
|
||||||
|
// print full usage message; it will be called internally by gpt_params_parse() if "-h" is set
|
||||||
|
void gpt_params_print_usage(gpt_params & params, std::vector<llama_arg> & options);
|
||||||
|
|
||||||
std::string gpt_params_get_system_info(const gpt_params & params);
|
std::string gpt_params_get_system_info(const gpt_params & params);
|
||||||
|
|
||||||
|
bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
|
||||||
|
bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
|
||||||
|
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
|
||||||
|
bool set_process_priority(enum ggml_sched_priority prio);
|
||||||
|
|
||||||
//
|
//
|
||||||
// String utils
|
// String utils
|
||||||
//
|
//
|
||||||
@ -327,8 +443,9 @@ struct llama_init_result {
|
|||||||
|
|
||||||
struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
|
struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
|
||||||
|
|
||||||
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
|
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
|
||||||
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
|
struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
|
||||||
|
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
|
||||||
|
|
||||||
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
|
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
|
||||||
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
|
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
|
||||||
|
@ -1,539 +0,0 @@
|
|||||||
#include "grammar-parser.h"
|
|
||||||
#include <cstdint>
|
|
||||||
#include <cwchar>
|
|
||||||
#include <string>
|
|
||||||
#include <utility>
|
|
||||||
#include <stdexcept>
|
|
||||||
#include <exception>
|
|
||||||
|
|
||||||
namespace grammar_parser {
|
|
||||||
// Decodes a single UTF-8 encoded code point starting at `src`.
// NOTE: assumes valid utf8 (but checks for overrun): decoding stops early at a
//       NUL byte, so a truncated sequence never reads past the terminator.
// Returns {code point, pointer just past the last byte consumed}.
static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
    // sequence length indexed by the high nibble of the lead byte
    static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };

    const uint8_t first_byte = static_cast<uint8_t>(*src);
    const int     len        = lookup[first_byte >> 4];
    const uint8_t mask       = (1 << (8 - len)) - 1;

    uint32_t value = first_byte & mask;

    const char * end = src + len; // may overrun!
    const char * pos = src + 1;
    for ( ; pos < end && *pos; pos++) {
        // each continuation byte contributes its low 6 bits
        value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
    }
    return std::make_pair(value, pos);
}
|
|
||||||
|
|
||||||
// Returns the id of the symbol named by the `len` bytes at `src`,
// registering it with the next free id if it has not been seen before.
static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
    const uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
    // emplace leaves an existing entry untouched and hands back its iterator
    const auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
    return result.first->second;
}
|
|
||||||
|
|
||||||
// Registers a fresh synthesized symbol named "<base_name>_<id>" and returns its id.
static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
    const uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
    state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
    return next_id;
}
|
|
||||||
|
|
||||||
// Stores `rule` at index `rule_id`, growing the rule table if needed
// (any slots created by the growth stay empty until they are assigned).
static void add_rule(
        parse_state & state,
        uint32_t      rule_id,
        const std::vector<llama_grammar_element> & rule) {
    if (state.rules.size() <= rule_id) {
        state.rules.resize(rule_id + 1);
    }
    state.rules[rule_id] = rule;
}
|
|
||||||
|
|
||||||
// True for the ASCII decimal digits '0'..'9'.
static bool is_digit_char(char c) {
    return '0' <= c && c <= '9';
}
|
|
||||||
|
|
||||||
static bool is_word_char(char c) {
|
|
||||||
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Parses exactly `size` hexadecimal digits starting at `src`.
// Returns {value, pointer past the last digit}.
// Throws std::runtime_error if fewer than `size` hex digits are available.
static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
    const char * pos = src;
    const char * end = src + size;
    uint32_t value = 0;
    for ( ; pos < end && *pos; pos++) {
        const char c = *pos;
        uint32_t digit;
        if ('a' <= c && c <= 'f') {
            digit = c - 'a' + 10;
        } else if ('A' <= c && c <= 'F') {
            digit = c - 'A' + 10;
        } else if ('0' <= c && c <= '9') {
            digit = c - '0';
        } else {
            break; // not a hex digit: reported below as a short read
        }
        value = (value << 4) + digit;
    }
    if (pos != end) {
        throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
    }
    return std::make_pair(value, pos);
}
|
|
||||||
|
|
||||||
// Skips spaces, tabs and '#' comments (which run to the end of the line).
// Line breaks are skipped as well only when `newline_ok` is true.
// Returns a pointer to the first character that was not skipped.
static const char * parse_space(const char * src, bool newline_ok) {
    const char * pos = src;
    while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
            (newline_ok && (*pos == '\r' || *pos == '\n'))) {
        if (*pos == '#') {
            // the comment body ends at the line break, which is not consumed here
            while (*pos && *pos != '\r' && *pos != '\n') {
                pos++;
            }
        } else {
            pos++;
        }
    }
    return pos;
}
|
|
||||||
|
|
||||||
static const char * parse_name(const char * src) {
|
|
||||||
const char * pos = src;
|
|
||||||
while (is_word_char(*pos)) {
|
|
||||||
pos++;
|
|
||||||
}
|
|
||||||
if (pos == src) {
|
|
||||||
throw std::runtime_error(std::string("expecting name at ") + src);
|
|
||||||
}
|
|
||||||
return pos;
|
|
||||||
}
|
|
||||||
|
|
||||||
static const char * parse_int(const char * src) {
|
|
||||||
const char * pos = src;
|
|
||||||
while (is_digit_char(*pos)) {
|
|
||||||
pos++;
|
|
||||||
}
|
|
||||||
if (pos == src) {
|
|
||||||
throw std::runtime_error(std::string("expecting integer at ") + src);
|
|
||||||
}
|
|
||||||
return pos;
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::pair<uint32_t, const char *> parse_char(const char * src) {
|
|
||||||
if (*src == '\\') {
|
|
||||||
switch (src[1]) {
|
|
||||||
case 'x': return parse_hex(src + 2, 2);
|
|
||||||
case 'u': return parse_hex(src + 2, 4);
|
|
||||||
case 'U': return parse_hex(src + 2, 8);
|
|
||||||
case 't': return std::make_pair('\t', src + 2);
|
|
||||||
case 'r': return std::make_pair('\r', src + 2);
|
|
||||||
case 'n': return std::make_pair('\n', src + 2);
|
|
||||||
case '\\':
|
|
||||||
case '"':
|
|
||||||
case '[':
|
|
||||||
case ']':
|
|
||||||
return std::make_pair(src[1], src + 2);
|
|
||||||
default:
|
|
||||||
throw std::runtime_error(std::string("unknown escape at ") + src);
|
|
||||||
}
|
|
||||||
} else if (*src) {
|
|
||||||
return decode_utf8(src);
|
|
||||||
}
|
|
||||||
throw std::runtime_error("unexpected end of input");
|
|
||||||
}
|
|
||||||
|
|
||||||
const char * parse_alternates(
|
|
||||||
parse_state & state,
|
|
||||||
const char * src,
|
|
||||||
const std::string & rule_name,
|
|
||||||
uint32_t rule_id,
|
|
||||||
bool is_nested);
|
|
||||||
|
|
||||||
static const char * parse_sequence(
|
|
||||||
parse_state & state,
|
|
||||||
const char * src,
|
|
||||||
const std::string & rule_name,
|
|
||||||
std::vector<llama_grammar_element> & out_elements,
|
|
||||||
bool is_nested) {
|
|
||||||
size_t last_sym_start = out_elements.size();
|
|
||||||
const char * pos = src;
|
|
||||||
|
|
||||||
auto handle_repetitions = [&](int min_times, int max_times) {
|
|
||||||
|
|
||||||
if (last_sym_start == out_elements.size()) {
|
|
||||||
throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
|
|
||||||
}
|
|
||||||
|
|
||||||
// apply transformation to previous symbol (last_sym_start to end) according to
|
|
||||||
// the following rewrite rules:
|
|
||||||
// S{m,n} --> S S S (m times) S'(n-m)
|
|
||||||
// S'(x) ::= S S'(x-1) |
|
|
||||||
// (... n-m definitions of these S' rules ...)
|
|
||||||
// S'(1) ::= S |
|
|
||||||
// S{m,} --> S S S (m times) S'
|
|
||||||
// S' ::= S S' |
|
|
||||||
// S* --> S{0,}
|
|
||||||
// --> S' ::= S S' |
|
|
||||||
// S+ --> S{1,}
|
|
||||||
// --> S S'
|
|
||||||
// S' ::= S S' |
|
|
||||||
// S? --> S{0,1}
|
|
||||||
// --> S'
|
|
||||||
// S' ::= S |
|
|
||||||
|
|
||||||
std::vector<llama_grammar_element> previous_elements(out_elements.begin() + last_sym_start, out_elements.end());
|
|
||||||
if (min_times == 0) {
|
|
||||||
out_elements.resize(last_sym_start);
|
|
||||||
} else {
|
|
||||||
// Repeat the previous elements (min_times - 1) times
|
|
||||||
for (int i = 1; i < min_times; i++) {
|
|
||||||
out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
uint32_t last_rec_rule_id = 0;
|
|
||||||
auto n_opt = max_times < 0 ? 1 : max_times - min_times;
|
|
||||||
|
|
||||||
std::vector<llama_grammar_element> rec_rule(previous_elements);
|
|
||||||
for (int i = 0; i < n_opt; i++) {
|
|
||||||
rec_rule.resize(previous_elements.size());
|
|
||||||
uint32_t rec_rule_id = generate_symbol_id(state, rule_name);
|
|
||||||
if (i > 0 || max_times < 0) {
|
|
||||||
rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
|
|
||||||
}
|
|
||||||
rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
|
|
||||||
rec_rule.push_back({LLAMA_GRETYPE_END, 0});
|
|
||||||
add_rule(state, rec_rule_id, rec_rule);
|
|
||||||
last_rec_rule_id = rec_rule_id;
|
|
||||||
}
|
|
||||||
if (n_opt > 0) {
|
|
||||||
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
while (*pos) {
|
|
||||||
if (*pos == '"') { // literal string
|
|
||||||
pos++;
|
|
||||||
last_sym_start = out_elements.size();
|
|
||||||
while (*pos != '"') {
|
|
||||||
if (!*pos) {
|
|
||||||
throw std::runtime_error("unexpected end of input");
|
|
||||||
}
|
|
||||||
auto char_pair = parse_char(pos);
|
|
||||||
pos = char_pair.second;
|
|
||||||
out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
|
|
||||||
}
|
|
||||||
pos = parse_space(pos + 1, is_nested);
|
|
||||||
} else if (*pos == '[') { // char range(s)
|
|
||||||
pos++;
|
|
||||||
enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
|
|
||||||
if (*pos == '^') {
|
|
||||||
pos++;
|
|
||||||
start_type = LLAMA_GRETYPE_CHAR_NOT;
|
|
||||||
}
|
|
||||||
last_sym_start = out_elements.size();
|
|
||||||
while (*pos != ']') {
|
|
||||||
if (!*pos) {
|
|
||||||
throw std::runtime_error("unexpected end of input");
|
|
||||||
}
|
|
||||||
auto char_pair = parse_char(pos);
|
|
||||||
pos = char_pair.second;
|
|
||||||
enum llama_gretype type = last_sym_start < out_elements.size()
|
|
||||||
? LLAMA_GRETYPE_CHAR_ALT
|
|
||||||
: start_type;
|
|
||||||
|
|
||||||
out_elements.push_back({type, char_pair.first});
|
|
||||||
if (pos[0] == '-' && pos[1] != ']') {
|
|
||||||
if (!pos[1]) {
|
|
||||||
throw std::runtime_error("unexpected end of input");
|
|
||||||
}
|
|
||||||
auto endchar_pair = parse_char(pos + 1);
|
|
||||||
pos = endchar_pair.second;
|
|
||||||
out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
pos = parse_space(pos + 1, is_nested);
|
|
||||||
} else if (is_word_char(*pos)) { // rule reference
|
|
||||||
const char * name_end = parse_name(pos);
|
|
||||||
uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos);
|
|
||||||
pos = parse_space(name_end, is_nested);
|
|
||||||
last_sym_start = out_elements.size();
|
|
||||||
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
|
|
||||||
} else if (*pos == '(') { // grouping
|
|
||||||
// parse nested alternates into synthesized rule
|
|
||||||
pos = parse_space(pos + 1, true);
|
|
||||||
uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
|
|
||||||
pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
|
|
||||||
last_sym_start = out_elements.size();
|
|
||||||
// output reference to synthesized rule
|
|
||||||
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
|
|
||||||
if (*pos != ')') {
|
|
||||||
throw std::runtime_error(std::string("expecting ')' at ") + pos);
|
|
||||||
}
|
|
||||||
pos = parse_space(pos + 1, is_nested);
|
|
||||||
} else if (*pos == '.') { // any char
|
|
||||||
last_sym_start = out_elements.size();
|
|
||||||
out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
|
|
||||||
pos = parse_space(pos + 1, is_nested);
|
|
||||||
} else if (*pos == '*') {
|
|
||||||
pos = parse_space(pos + 1, is_nested);
|
|
||||||
handle_repetitions(0, -1);
|
|
||||||
} else if (*pos == '+') {
|
|
||||||
pos = parse_space(pos + 1, is_nested);
|
|
||||||
handle_repetitions(1, -1);
|
|
||||||
} else if (*pos == '?') {
|
|
||||||
pos = parse_space(pos + 1, is_nested);
|
|
||||||
handle_repetitions(0, 1);
|
|
||||||
} else if (*pos == '{') {
|
|
||||||
pos = parse_space(pos + 1, is_nested);
|
|
||||||
|
|
||||||
if (!is_digit_char(*pos)) {
|
|
||||||
throw std::runtime_error(std::string("expecting an int at ") + pos);
|
|
||||||
}
|
|
||||||
const char * int_end = parse_int(pos);
|
|
||||||
int min_times = std::stoul(std::string(pos, int_end - pos));
|
|
||||||
pos = parse_space(int_end, is_nested);
|
|
||||||
|
|
||||||
int max_times = -1;
|
|
||||||
|
|
||||||
if (*pos == '}') {
|
|
||||||
max_times = min_times;
|
|
||||||
pos = parse_space(pos + 1, is_nested);
|
|
||||||
} else if (*pos == ',') {
|
|
||||||
pos = parse_space(pos + 1, is_nested);
|
|
||||||
|
|
||||||
if (is_digit_char(*pos)) {
|
|
||||||
const char * int_end = parse_int(pos);
|
|
||||||
max_times = std::stoul(std::string(pos, int_end - pos));
|
|
||||||
pos = parse_space(int_end, is_nested);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (*pos != '}') {
|
|
||||||
throw std::runtime_error(std::string("expecting '}' at ") + pos);
|
|
||||||
}
|
|
||||||
pos = parse_space(pos + 1, is_nested);
|
|
||||||
} else {
|
|
||||||
throw std::runtime_error(std::string("expecting ',' at ") + pos);
|
|
||||||
}
|
|
||||||
handle_repetitions(min_times, max_times);
|
|
||||||
} else {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return pos;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Parses a '|'-separated list of sequences starting at `src` and stores the
// resulting rule in `state` under `rule_id`.
// Returns a pointer to the first character after the alternates.
const char * parse_alternates(
        parse_state       & state,
        const char        * src,
        const std::string & rule_name,
        uint32_t            rule_id,
        bool                is_nested) {
    std::vector<llama_grammar_element> rule;
    const char * pos = parse_sequence(state, src, rule_name, rule, is_nested);
    while (*pos == '|') {
        rule.push_back({LLAMA_GRETYPE_ALT, 0});
        // line breaks are always allowed right after '|'
        pos = parse_space(pos + 1, true);
        pos = parse_sequence(state, pos, rule_name, rule, is_nested);
    }
    rule.push_back({LLAMA_GRETYPE_END, 0});
    add_rule(state, rule_id, rule);
    return pos;
}
|
|
||||||
|
|
||||||
// Parses one "name ::= alternates" definition starting at `src`, including the
// line break that ends it, and records the rule in `state`.
// Returns a pointer to the start of the next rule (or to the terminating NUL).
// Throws std::runtime_error on malformed input.
static const char * parse_rule(parse_state & state, const char * src) {
    const char * name_end = parse_name(src);
    const char * pos      = parse_space(name_end, false);
    const size_t name_len = name_end - src;
    const uint32_t rule_id = get_symbol_id(state, src, name_len);
    const std::string name(src, name_len);

    if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
        throw std::runtime_error(std::string("expecting ::= at ") + pos);
    }
    pos = parse_space(pos + 3, true);

    pos = parse_alternates(state, pos, name, rule_id, false);

    // a rule ends at "\r\n", "\r", "\n" or the end of input
    if (*pos == '\r') {
        pos += pos[1] == '\n' ? 2 : 1;
    } else if (*pos == '\n') {
        pos++;
    } else if (*pos) {
        throw std::runtime_error(std::string("expecting newline or end at ") + pos);
    }
    return parse_space(pos, true);
}
|
|
||||||
|
|
||||||
// Parses a complete grammar from the NUL-terminated text `src`.
// Returns the populated parse_state; on any parse or validation error the
// message is written to stderr and an empty parse_state is returned instead.
parse_state parse(const char * src) {
    try {
        parse_state state;
        const char * pos = parse_space(src, true);
        while (*pos) {
            pos = parse_rule(state, pos);
        }
        // Validate the state to ensure that all rules are defined
        for (const auto & rule : state.rules) {
            if (rule.empty()) {
                throw std::runtime_error("Undefined rule");
            }
            for (const auto & elem : rule) {
                if (elem.type != LLAMA_GRETYPE_RULE_REF) {
                    continue;
                }
                // Ensure that the rule at that location exists
                if (elem.value >= state.rules.size() || state.rules[elem.value].empty()) {
                    // Get the name of the rule that is missing
                    for (const auto & kv : state.symbol_ids) {
                        if (kv.second == elem.value) {
                            throw std::runtime_error("Undefined rule identifier '" + kv.first + "'");
                        }
                    }
                }
            }
        }
        return state;
    } catch (const std::exception & err) {
        fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
        return parse_state();
    }
}
|
|
||||||
|
|
||||||
// Writes code point `c` to `file`: printable ASCII as-is, anything else as <U+XXXX>.
static void print_grammar_char(FILE * file, uint32_t c) {
    // 0x7f is DEL, a control character, so the printable range stops at 0x7e
    if (0x20 <= c && c < 0x7f) {
        fprintf(file, "%c", static_cast<char>(c));
    } else {
        // cop out of encoding UTF-8
        fprintf(file, "<U+%04X>", c);
    }
}
|
|
||||||
|
|
||||||
// True when `elem` is one of the character-matching element types.
static bool is_char_element(llama_grammar_element elem) {
    switch (elem.type) {
        case LLAMA_GRETYPE_CHAR:
        case LLAMA_GRETYPE_CHAR_NOT:
        case LLAMA_GRETYPE_CHAR_ALT:
        case LLAMA_GRETYPE_CHAR_RNG_UPPER:
        case LLAMA_GRETYPE_CHAR_ANY:
            return true;
        default:
            return false;
    }
}
|
|
||||||
|
|
||||||
static void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
|
|
||||||
for (auto elem : rule) {
|
|
||||||
switch (elem.type) {
|
|
||||||
case LLAMA_GRETYPE_END: fprintf(file, "END"); break;
|
|
||||||
case LLAMA_GRETYPE_ALT: fprintf(file, "ALT"); break;
|
|
||||||
case LLAMA_GRETYPE_RULE_REF: fprintf(file, "RULE_REF"); break;
|
|
||||||
case LLAMA_GRETYPE_CHAR: fprintf(file, "CHAR"); break;
|
|
||||||
case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
|
|
||||||
case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
|
|
||||||
case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
|
|
||||||
case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break;
|
|
||||||
}
|
|
||||||
switch (elem.type) {
|
|
||||||
case LLAMA_GRETYPE_END:
|
|
||||||
case LLAMA_GRETYPE_ALT:
|
|
||||||
case LLAMA_GRETYPE_RULE_REF:
|
|
||||||
fprintf(file, "(%u) ", elem.value);
|
|
||||||
break;
|
|
||||||
case LLAMA_GRETYPE_CHAR:
|
|
||||||
case LLAMA_GRETYPE_CHAR_NOT:
|
|
||||||
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
|
|
||||||
case LLAMA_GRETYPE_CHAR_ALT:
|
|
||||||
case LLAMA_GRETYPE_CHAR_ANY:
|
|
||||||
fprintf(file, "(\"");
|
|
||||||
print_grammar_char(file, elem.value);
|
|
||||||
fprintf(file, "\") ");
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fprintf(file, "\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
static void print_rule(
|
|
||||||
FILE * file,
|
|
||||||
uint32_t rule_id,
|
|
||||||
const std::vector<llama_grammar_element> & rule,
|
|
||||||
const std::map<uint32_t, std::string> & symbol_id_names) {
|
|
||||||
if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
|
|
||||||
throw std::runtime_error(
|
|
||||||
"malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
|
|
||||||
}
|
|
||||||
fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
|
|
||||||
for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
|
|
||||||
llama_grammar_element elem = rule[i];
|
|
||||||
switch (elem.type) {
|
|
||||||
case LLAMA_GRETYPE_END:
|
|
||||||
throw std::runtime_error(
|
|
||||||
"unexpected end of rule: " + std::to_string(rule_id) + "," +
|
|
||||||
std::to_string(i));
|
|
||||||
case LLAMA_GRETYPE_ALT:
|
|
||||||
fprintf(file, "| ");
|
|
||||||
break;
|
|
||||||
case LLAMA_GRETYPE_RULE_REF:
|
|
||||||
fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
|
|
||||||
break;
|
|
||||||
case LLAMA_GRETYPE_CHAR:
|
|
||||||
fprintf(file, "[");
|
|
||||||
print_grammar_char(file, elem.value);
|
|
||||||
break;
|
|
||||||
case LLAMA_GRETYPE_CHAR_NOT:
|
|
||||||
fprintf(file, "[^");
|
|
||||||
print_grammar_char(file, elem.value);
|
|
||||||
break;
|
|
||||||
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
|
|
||||||
if (i == 0 || !is_char_element(rule[i - 1])) {
|
|
||||||
throw std::runtime_error(
|
|
||||||
"LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
|
|
||||||
std::to_string(rule_id) + "," + std::to_string(i));
|
|
||||||
}
|
|
||||||
fprintf(file, "-");
|
|
||||||
print_grammar_char(file, elem.value);
|
|
||||||
break;
|
|
||||||
case LLAMA_GRETYPE_CHAR_ALT:
|
|
||||||
if (i == 0 || !is_char_element(rule[i - 1])) {
|
|
||||||
throw std::runtime_error(
|
|
||||||
"LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
|
|
||||||
std::to_string(rule_id) + "," + std::to_string(i));
|
|
||||||
}
|
|
||||||
print_grammar_char(file, elem.value);
|
|
||||||
break;
|
|
||||||
case LLAMA_GRETYPE_CHAR_ANY:
|
|
||||||
fprintf(file, ".");
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (is_char_element(elem)) {
|
|
||||||
switch (rule[i + 1].type) {
|
|
||||||
case LLAMA_GRETYPE_CHAR_ALT:
|
|
||||||
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
|
|
||||||
case LLAMA_GRETYPE_CHAR_ANY:
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
fprintf(file, "] ");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fprintf(file, "\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
void print_grammar(FILE * file, const parse_state & state) {
|
|
||||||
try {
|
|
||||||
std::map<uint32_t, std::string> symbol_id_names;
|
|
||||||
for (const auto & kv : state.symbol_ids) {
|
|
||||||
symbol_id_names[kv.second] = kv.first;
|
|
||||||
}
|
|
||||||
for (size_t i = 0, end = state.rules.size(); i < end; i++) {
|
|
||||||
// fprintf(file, "%zu: ", i);
|
|
||||||
// print_rule_binary(file, state.rules[i]);
|
|
||||||
print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
|
|
||||||
// fprintf(file, "\n");
|
|
||||||
}
|
|
||||||
} catch (const std::exception & err) {
|
|
||||||
fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<const llama_grammar_element *> parse_state::c_rules() {
|
|
||||||
std::vector<const llama_grammar_element *> ret;
|
|
||||||
ret.reserve(rules.size());
|
|
||||||
for (const auto & rule : rules) {
|
|
||||||
ret.push_back(rule.data());
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,29 +0,0 @@
|
|||||||
// Implements a parser for an extended Backus-Naur form (BNF), producing the
|
|
||||||
// binary context-free grammar format specified by llama.h. Supports character
|
|
||||||
// ranges, grouping, and repetition operators. As an example, a grammar for
|
|
||||||
// arithmetic might look like:
|
|
||||||
//
|
|
||||||
// root ::= expr
|
|
||||||
// expr ::= term ([-+*/] term)*
|
|
||||||
// term ::= num | "(" space expr ")" space
|
|
||||||
// num ::= [0-9]+ space
|
|
||||||
// space ::= [ \t\n]*
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
#include "llama.h"
|
|
||||||
#include <vector>
|
|
||||||
#include <map>
|
|
||||||
#include <cstdint>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
namespace grammar_parser {
|
|
||||||
struct parse_state {
|
|
||||||
std::map<std::string, uint32_t> symbol_ids;
|
|
||||||
std::vector<std::vector<llama_grammar_element>> rules;
|
|
||||||
|
|
||||||
std::vector<const llama_grammar_element *> c_rules();
|
|
||||||
};
|
|
||||||
|
|
||||||
parse_state parse(const char * src);
|
|
||||||
void print_grammar(FILE * file, const parse_state & state);
|
|
||||||
}
|
|
@ -1,460 +1,443 @@
|
|||||||
#define LLAMA_API_INTERNAL
|
|
||||||
#include "sampling.h"
|
#include "sampling.h"
|
||||||
#include <random>
|
|
||||||
|
|
||||||
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
|
#include "common.h"
|
||||||
struct llama_sampling_context * result = new llama_sampling_context();
|
|
||||||
|
|
||||||
result->params = params;
|
// the ring buffer works similarly to std::deque, but with a fixed capacity
|
||||||
result->grammar = nullptr;
|
// TODO: deduplicate with llama-impl.h
|
||||||
|
template<typename T>
|
||||||
|
struct ring_buffer {
|
||||||
|
ring_buffer(size_t cap) : capacity(cap), data(cap) {}
|
||||||
|
|
||||||
// if there is a grammar, parse it
|
T & front() {
|
||||||
if (!params.grammar.empty()) {
|
if (sz == 0) {
|
||||||
result->parsed_grammar = grammar_parser::parse(params.grammar.c_str());
|
throw std::runtime_error("ring buffer is empty");
|
||||||
|
}
|
||||||
|
return data[first];
|
||||||
|
}
|
||||||
|
|
||||||
// will be empty (default) if there are parse errors
|
const T & front() const {
|
||||||
if (result->parsed_grammar.rules.empty()) {
|
if (sz == 0) {
|
||||||
fprintf(stderr, "%s: failed to parse grammar\n", __func__);
|
throw std::runtime_error("ring buffer is empty");
|
||||||
delete result;
|
}
|
||||||
return nullptr;
|
return data[first];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the most recently pushed element.
// Throws std::runtime_error when the buffer is empty.
T & back() {
    if (sz == 0) {
        throw std::runtime_error("ring buffer is empty");
    }
    // pos is the slot the NEXT push_back will write to, so the newest
    // element lives one slot before it (wrapping around)
    return data[(pos + capacity - 1) % capacity];
}
|
||||||
|
|
||||||
|
// Returns the most recently pushed element (read-only).
// Throws std::runtime_error when the buffer is empty.
const T & back() const {
    if (sz == 0) {
        throw std::runtime_error("ring buffer is empty");
    }
    // pos is the slot the NEXT push_back will write to, so the newest
    // element lives one slot before it (wrapping around)
    return data[(pos + capacity - 1) % capacity];
}
|
||||||
|
|
||||||
|
void push_back(const T & value) {
|
||||||
|
if (sz == capacity) {
|
||||||
|
// advance the start when buffer is full
|
||||||
|
first = (first + 1) % capacity;
|
||||||
|
} else {
|
||||||
|
sz++;
|
||||||
|
}
|
||||||
|
data[pos] = value;
|
||||||
|
pos = (pos + 1) % capacity;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Removes the oldest element and returns a copy of it.
// Throws std::runtime_error when the buffer is empty.
T pop_front() {
    if (sz == 0) {
        throw std::runtime_error("ring buffer is empty");
    }
    T value = data[first];
    first = (first + 1) % capacity;
    sz--;
    return value;
}
|
||||||
|
|
||||||
|
// Reverse access: rat(0) is the newest element, rat(size() - 1) the oldest.
// Throws std::runtime_error when `i` is not smaller than size().
const T & rat(size_t i) const {
    if (i >= sz) {
        throw std::runtime_error("ring buffer: index out of bounds");
    }
    return data[(first + sz - i - 1) % capacity];
}
|
||||||
|
|
||||||
|
std::vector<T> to_vector() const {
|
||||||
|
std::vector<T> result;
|
||||||
|
result.reserve(sz);
|
||||||
|
for (size_t i = 0; i < sz; i++) {
|
||||||
|
result.push_back(data[(first + i) % capacity]);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Empties the buffer.
void clear() {
    // here only reset the status of the buffer
    // (the stored elements themselves are left in place)
    sz    = 0;
    first = 0;
    pos   = 0;
}
|
||||||
|
|
||||||
|
bool empty() const {
|
||||||
|
return sz == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t size() const {
|
||||||
|
return sz;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t capacity = 0;
|
||||||
|
size_t sz = 0;
|
||||||
|
size_t first = 0;
|
||||||
|
size_t pos = 0;
|
||||||
|
std::vector<T> data;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct gpt_sampler {
|
||||||
|
gpt_sampler_params params;
|
||||||
|
|
||||||
|
struct llama_sampler * grmr;
|
||||||
|
struct llama_sampler * chain;
|
||||||
|
|
||||||
|
ring_buffer<llama_token> prev;
|
||||||
|
|
||||||
|
std::vector<llama_token_data> cur;
|
||||||
|
|
||||||
|
llama_token_data_array cur_p;
|
||||||
|
|
||||||
|
void set_logits(struct llama_context * ctx, int idx) {
|
||||||
|
const auto * logits = llama_get_logits_ith(ctx, idx);
|
||||||
|
|
||||||
|
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||||
|
|
||||||
|
cur.resize(n_vocab);
|
||||||
|
|
||||||
|
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||||
|
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ensure that there is a "root" node.
|
cur_p = { cur.data(), cur.size(), -1, false };
|
||||||
if (result->parsed_grammar.symbol_ids.find("root") == result->parsed_grammar.symbol_ids.end()) {
|
|
||||||
fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
|
|
||||||
delete result;
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
|
|
||||||
|
|
||||||
struct llama_grammar * grammar = llama_grammar_init(
|
|
||||||
grammar_rules.data(),
|
|
||||||
grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
|
|
||||||
if (grammar == nullptr) {
|
|
||||||
throw std::runtime_error("Failed to initialize llama_grammar");
|
|
||||||
}
|
|
||||||
result->grammar = grammar;
|
|
||||||
}
|
}
|
||||||
|
};
|
||||||
|
|
||||||
result->prev.resize(params.n_prev);
|
std::string gpt_sampler_params::print() const {
|
||||||
|
|
||||||
result->n_valid = 0;
|
|
||||||
|
|
||||||
llama_sampling_set_rng_seed(result, params.seed);
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
void llama_sampling_free(struct llama_sampling_context * ctx) {
|
|
||||||
if (ctx->grammar != NULL) {
|
|
||||||
llama_grammar_free(ctx->grammar);
|
|
||||||
}
|
|
||||||
|
|
||||||
delete ctx;
|
|
||||||
}
|
|
||||||
|
|
||||||
void llama_sampling_reset(llama_sampling_context * ctx) {
|
|
||||||
if (ctx->grammar != NULL) {
|
|
||||||
llama_grammar_free(ctx->grammar);
|
|
||||||
ctx->grammar = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!ctx->parsed_grammar.rules.empty()) {
|
|
||||||
std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
|
|
||||||
|
|
||||||
struct llama_grammar * grammar = llama_grammar_init(
|
|
||||||
grammar_rules.data(),
|
|
||||||
grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
|
|
||||||
if (grammar == nullptr) {
|
|
||||||
throw std::runtime_error("Failed to initialize llama_grammar");
|
|
||||||
}
|
|
||||||
ctx->grammar = grammar;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
|
|
||||||
ctx->cur.clear();
|
|
||||||
ctx->n_valid = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
|
|
||||||
if (seed == LLAMA_DEFAULT_SEED) {
|
|
||||||
seed = std::random_device{}();
|
|
||||||
}
|
|
||||||
ctx->rng.seed(seed);
|
|
||||||
}
|
|
||||||
|
|
||||||
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
|
|
||||||
if (dst->grammar) {
|
|
||||||
llama_grammar_free(dst->grammar);
|
|
||||||
dst->grammar = nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (src->grammar) {
|
|
||||||
dst->grammar = llama_grammar_copy(src->grammar);
|
|
||||||
}
|
|
||||||
|
|
||||||
dst->prev = src->prev;
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_token llama_sampling_last(llama_sampling_context * ctx) {
|
|
||||||
return ctx->prev.back();
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) {
|
|
||||||
const int size = ctx_sampling->prev.size();
|
|
||||||
|
|
||||||
n = std::min(n, size);
|
|
||||||
|
|
||||||
std::string result;
|
|
||||||
|
|
||||||
for (int i = size - n; i < size; i++) {
|
|
||||||
result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string llama_sampling_print(const llama_sampling_params & params) {
|
|
||||||
char result[1024];
|
char result[1024];
|
||||||
|
|
||||||
snprintf(result, sizeof(result),
|
snprintf(result, sizeof(result),
|
||||||
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
|
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
|
||||||
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
|
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
|
||||||
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
|
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
|
||||||
params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
|
penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
|
||||||
params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
|
top_k, tfs_z, top_p, min_p, typ_p, temp,
|
||||||
params.mirostat, params.mirostat_eta, params.mirostat_tau);
|
mirostat, mirostat_eta, mirostat_tau);
|
||||||
|
|
||||||
return std::string(result);
|
return std::string(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string llama_sampling_order_print(const llama_sampling_params & params) {
|
struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
|
||||||
std::string result = "CFG -> Penalties ";
|
llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
|
||||||
if (params.mirostat == 0) {
|
|
||||||
for (auto sampler_type : params.samplers_sequence) {
|
lparams.no_perf = false; // TODO: control via params
|
||||||
const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
|
|
||||||
if (!sampler_type_name.empty()) {
|
auto * result = new gpt_sampler {
|
||||||
result += "-> " + sampler_type_name + " ";
|
/* .params = */ params,
|
||||||
|
/* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
|
||||||
|
/* .chain = */ llama_sampler_chain_init(lparams),
|
||||||
|
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
|
||||||
|
/* .cur = */ {},
|
||||||
|
/* .cur_p = */ {},
|
||||||
|
};
|
||||||
|
|
||||||
|
llama_sampler_chain_add(result->chain,
|
||||||
|
llama_sampler_init_logit_bias(
|
||||||
|
llama_n_vocab(model),
|
||||||
|
params.logit_bias.size(),
|
||||||
|
params.logit_bias.data()));
|
||||||
|
|
||||||
|
llama_sampler_chain_add(result->chain,
|
||||||
|
llama_sampler_init_penalties(
|
||||||
|
llama_n_vocab (model),
|
||||||
|
llama_token_eos(model),
|
||||||
|
llama_token_nl (model),
|
||||||
|
params.penalty_last_n,
|
||||||
|
params.penalty_repeat,
|
||||||
|
params.penalty_freq,
|
||||||
|
params.penalty_present,
|
||||||
|
params.penalize_nl,
|
||||||
|
params.ignore_eos));
|
||||||
|
|
||||||
|
if (params.temp > 0.0f) {
|
||||||
|
if (params.mirostat == 0) {
|
||||||
|
for (const auto & cnstr : params.samplers) {
|
||||||
|
switch (cnstr) {
|
||||||
|
case GPT_SAMPLER_TYPE_TOP_K:
|
||||||
|
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
||||||
|
break;
|
||||||
|
case GPT_SAMPLER_TYPE_TOP_P:
|
||||||
|
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
|
||||||
|
break;
|
||||||
|
case GPT_SAMPLER_TYPE_MIN_P:
|
||||||
|
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
||||||
|
break;
|
||||||
|
case GPT_SAMPLER_TYPE_TFS_Z:
|
||||||
|
llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
|
||||||
|
break;
|
||||||
|
case GPT_SAMPLER_TYPE_TYPICAL_P:
|
||||||
|
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
|
||||||
|
break;
|
||||||
|
case GPT_SAMPLER_TYPE_TEMPERATURE:
|
||||||
|
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false && "unknown sampler type");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
|
||||||
|
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
|
||||||
|
} else if (params.mirostat == 1) {
|
||||||
|
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
|
||||||
|
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
|
||||||
|
} else if (params.mirostat == 2) {
|
||||||
|
llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
|
||||||
|
llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
|
||||||
|
} else {
|
||||||
|
GGML_ASSERT(false && "unknown mirostat version");
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
result += "-> mirostat ";
|
llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
|
||||||
|
llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
|
void gpt_sampler_free(struct gpt_sampler * gsmpl) {
|
||||||
switch (sampler_type) {
|
if (gsmpl) {
|
||||||
case llama_sampler_type::TOP_K: return "top_k";
|
llama_sampler_free(gsmpl->grmr);
|
||||||
case llama_sampler_type::TFS_Z: return "tfs_z";
|
|
||||||
case llama_sampler_type::TYPICAL_P: return "typical_p";
|
llama_sampler_free(gsmpl->chain);
|
||||||
case llama_sampler_type::TOP_P: return "top_p";
|
|
||||||
case llama_sampler_type::MIN_P: return "min_p";
|
delete gsmpl;
|
||||||
case llama_sampler_type::TEMPERATURE: return "temperature";
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Registers `token` as accepted: it is passed to the grammar sampler (only
// when accept_grammar is set), then to the sampler chain, and finally appended
// to the history of previous tokens.
void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar) {
    struct llama_sampler * const chain = gsmpl->chain;

    if (accept_grammar) {
        llama_sampler_accept(gsmpl->grmr, token);
    }
    llama_sampler_accept(chain, token);

    auto & history = gsmpl->prev;
    history.push_back(token);
}
|
||||||
|
|
||||||
|
// Resets the grammar sampler first and the sampler chain second.
// The token history (prev) is not modified here.
void gpt_sampler_reset(struct gpt_sampler * gsmpl) {
    struct llama_sampler * const in_order[] = { gsmpl->grmr, gsmpl->chain };

    for (struct llama_sampler * smpl : in_order) {
        llama_sampler_reset(smpl);
    }
}
|
||||||
|
|
||||||
|
struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
|
||||||
|
return new gpt_sampler {
|
||||||
|
/* .params = */ gsmpl->params,
|
||||||
|
/* .grmr = */ llama_sampler_clone(gsmpl->grmr),
|
||||||
|
/* .chain = */ llama_sampler_clone(gsmpl->chain),
|
||||||
|
/* .prev = */ gsmpl->prev,
|
||||||
|
/* .cur = */ gsmpl->cur,
|
||||||
|
/* .cur_p = */ gsmpl->cur_p,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
|
||||||
|
// TODO: measure grammar performance
|
||||||
|
|
||||||
|
if (gsmpl) {
|
||||||
|
llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
|
||||||
|
}
|
||||||
|
if (ctx) {
|
||||||
|
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Samples one token from the logits at position `idx` of `ctx`.
//
// With grammar_first set, the grammar sampler is applied to all candidates
// before the sampler chain and the selected token is returned directly.
// Otherwise the chain runs alone, the selected token is checked against the
// grammar afterwards, and only if that check fails the candidates are rebuilt
// and sampled again with the grammar applied first.
llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
    gsmpl->set_logits(ctx, idx);

    struct llama_sampler * const grammar  = gsmpl->grmr;
    struct llama_sampler * const pipeline = gsmpl->chain;

    auto & cur_p = gsmpl->cur_p; // initialized by set_logits

    if (grammar_first) {
        llama_sampler_apply(grammar, &cur_p);
    }
    llama_sampler_apply(pipeline, &cur_p);

    GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");

    const llama_token id = cur_p.data[cur_p.selected].id;

    // when the grammar was applied up front there is nothing left to verify
    bool accepted = grammar_first;

    if (!accepted) {
        // check if the sampled token fits the grammar: run the grammar sampler
        // on a one-element candidate list and see whether the logit survives
        llama_token_data       probe       = { id, 1.0f, 0.0f };
        llama_token_data_array probe_array = { &probe, 1, -1, false };

        llama_sampler_apply(grammar, &probe_array);

        accepted = probe_array.data[0].logit != -INFINITY;
    }

    if (accepted) {
        return id;
    }

    // resampling:
    // the token is not valid, so sample again, but this time apply the grammar
    // sampler first and the sampling chain second
    gsmpl->set_logits(ctx, idx);

    llama_sampler_apply(grammar,  &cur_p);
    llama_sampler_apply(pipeline, &cur_p);

    GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");

    return cur_p.data[cur_p.selected].id;
}
|
||||||
|
|
||||||
|
// helpers
|
||||||
|
|
||||||
|
// Gives access to the sampler's current candidate array (cur_p). The returned
// pointer refers to a member of gsmpl, not to a copy.
llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
    llama_token_data_array & candidates = gsmpl->cur_p;
    return &candidates;
}
|
||||||
|
|
||||||
|
// Returns the most recent entry of the token history, read through the ring
// buffer's reverse accessor at index 0.
llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
    const auto & history = gsmpl->prev;
    return history.rat(0);
}
|
||||||
|
|
||||||
|
std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
|
||||||
|
std::string result = "\tlogits ";
|
||||||
|
|
||||||
|
for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
|
||||||
|
const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
|
||||||
|
result += std::string("-> ") + llama_sampler_name(smpl) + " ";
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Converts the last `n` tokens of the sampling history to text, oldest of
// those tokens first. `n` is clamped to the number of stored tokens; an empty
// string is returned when nothing remains to print.
std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) {
    const int n_stored = (int) gsmpl->prev.size();
    if (n_stored < n) {
        n = n_stored;
    }

    std::string result;
    if (n <= 0) {
        return result;
    }

    result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab

    // rat(0) is the newest token, so count the reverse index down to zero
    int i = n;
    while (i-- > 0) {
        const llama_token id = gsmpl->prev.rat(i);

        GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");

        result += llama_token_to_piece(ctx_main, id);
    }

    return result;
}
|
||||||
|
|
||||||
|
char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
|
||||||
|
switch (cnstr) {
|
||||||
|
case GPT_SAMPLER_TYPE_TOP_K: return 'k';
|
||||||
|
case GPT_SAMPLER_TYPE_TFS_Z: return 'f';
|
||||||
|
case GPT_SAMPLER_TYPE_TYPICAL_P: return 'y';
|
||||||
|
case GPT_SAMPLER_TYPE_TOP_P: return 'p';
|
||||||
|
case GPT_SAMPLER_TYPE_MIN_P: return 'm';
|
||||||
|
case GPT_SAMPLER_TYPE_TEMPERATURE: return 't';
|
||||||
|
default : return '?';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) {
|
||||||
|
switch (cnstr) {
|
||||||
|
case GPT_SAMPLER_TYPE_TOP_K: return "top_k";
|
||||||
|
case GPT_SAMPLER_TYPE_TFS_Z: return "tfs_z";
|
||||||
|
case GPT_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
|
||||||
|
case GPT_SAMPLER_TYPE_TOP_P: return "top_p";
|
||||||
|
case GPT_SAMPLER_TYPE_MIN_P: return "min_p";
|
||||||
|
case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature";
|
||||||
default : return "";
|
default : return "";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
|
std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
|
||||||
std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
|
std::unordered_map<std::string, gpt_sampler_type> sampler_canonical_name_map {
|
||||||
{"top_k", llama_sampler_type::TOP_K},
|
{ "top_k", GPT_SAMPLER_TYPE_TOP_K },
|
||||||
{"top_p", llama_sampler_type::TOP_P},
|
{ "top_p", GPT_SAMPLER_TYPE_TOP_P },
|
||||||
{"typical_p", llama_sampler_type::TYPICAL_P},
|
{ "typ_p", GPT_SAMPLER_TYPE_TYPICAL_P },
|
||||||
{"min_p", llama_sampler_type::MIN_P},
|
{ "min_p", GPT_SAMPLER_TYPE_MIN_P },
|
||||||
{"tfs_z", llama_sampler_type::TFS_Z},
|
{ "tfs_z", GPT_SAMPLER_TYPE_TFS_Z },
|
||||||
{"temperature", llama_sampler_type::TEMPERATURE}
|
{ "temperature", GPT_SAMPLER_TYPE_TEMPERATURE },
|
||||||
};
|
};
|
||||||
|
|
||||||
// since samplers names are written multiple ways
|
// since samplers names are written multiple ways
|
||||||
// make it ready for both system names and input names
|
// make it ready for both system names and input names
|
||||||
std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
|
std::unordered_map<std::string, gpt_sampler_type> sampler_alt_name_map {
|
||||||
{"top-k", llama_sampler_type::TOP_K},
|
{ "top-k", GPT_SAMPLER_TYPE_TOP_K },
|
||||||
{"top-p", llama_sampler_type::TOP_P},
|
{ "top-p", GPT_SAMPLER_TYPE_TOP_P },
|
||||||
{"nucleus", llama_sampler_type::TOP_P},
|
{ "nucleus", GPT_SAMPLER_TYPE_TOP_P },
|
||||||
{"typical-p", llama_sampler_type::TYPICAL_P},
|
{ "typical-p", GPT_SAMPLER_TYPE_TYPICAL_P },
|
||||||
{"typical", llama_sampler_type::TYPICAL_P},
|
{ "typical", GPT_SAMPLER_TYPE_TYPICAL_P },
|
||||||
{"min-p", llama_sampler_type::MIN_P},
|
{ "typ-p", GPT_SAMPLER_TYPE_TYPICAL_P },
|
||||||
{"tfs-z", llama_sampler_type::TFS_Z},
|
{ "typ", GPT_SAMPLER_TYPE_TYPICAL_P },
|
||||||
{"tfs", llama_sampler_type::TFS_Z},
|
{ "min-p", GPT_SAMPLER_TYPE_MIN_P },
|
||||||
{"temp", llama_sampler_type::TEMPERATURE}
|
{ "tfs-z", GPT_SAMPLER_TYPE_TFS_Z },
|
||||||
|
{ "tfs", GPT_SAMPLER_TYPE_TFS_Z },
|
||||||
|
{ "temp", GPT_SAMPLER_TYPE_TEMPERATURE },
|
||||||
};
|
};
|
||||||
|
|
||||||
std::vector<llama_sampler_type> sampler_types;
|
std::vector<gpt_sampler_type> samplers;
|
||||||
sampler_types.reserve(names.size());
|
samplers.reserve(names.size());
|
||||||
for (const auto & name : names)
|
|
||||||
{
|
|
||||||
auto sampler_item = sampler_canonical_name_map.find(name);
|
|
||||||
if (sampler_item != sampler_canonical_name_map.end())
|
|
||||||
{
|
|
||||||
sampler_types.push_back(sampler_item->second);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (allow_alt_names)
|
|
||||||
{
|
|
||||||
sampler_item = sampler_alt_name_map.find(name);
|
|
||||||
if (sampler_item != sampler_alt_name_map.end())
|
|
||||||
{
|
|
||||||
sampler_types.push_back(sampler_item->second);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return sampler_types;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
|
for (const auto & name : names) {
|
||||||
std::unordered_map<char, llama_sampler_type> sampler_name_map {
|
auto sampler = sampler_canonical_name_map.find(name);
|
||||||
{'k', llama_sampler_type::TOP_K},
|
if (sampler != sampler_canonical_name_map.end()) {
|
||||||
{'p', llama_sampler_type::TOP_P},
|
samplers.push_back(sampler->second);
|
||||||
{'y', llama_sampler_type::TYPICAL_P},
|
|
||||||
{'m', llama_sampler_type::MIN_P},
|
|
||||||
{'f', llama_sampler_type::TFS_Z},
|
|
||||||
{'t', llama_sampler_type::TEMPERATURE}
|
|
||||||
};
|
|
||||||
|
|
||||||
std::vector<llama_sampler_type> sampler_types;
|
|
||||||
sampler_types.reserve(names_string.size());
|
|
||||||
for (const auto & c : names_string) {
|
|
||||||
const auto sampler_item = sampler_name_map.find(c);
|
|
||||||
if (sampler_item != sampler_name_map.end()) {
|
|
||||||
sampler_types.push_back(sampler_item->second);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return sampler_types;
|
|
||||||
}
|
|
||||||
|
|
||||||
// no reasons to expose this function in header
|
|
||||||
static void sampler_queue(
|
|
||||||
struct llama_context * ctx_main,
|
|
||||||
const llama_sampling_params & params,
|
|
||||||
llama_token_data_array & cur_p,
|
|
||||||
size_t min_keep) {
|
|
||||||
const float temp = params.temp;
|
|
||||||
const float dynatemp_range = params.dynatemp_range;
|
|
||||||
const float dynatemp_exponent = params.dynatemp_exponent;
|
|
||||||
const int32_t top_k = params.top_k;
|
|
||||||
const float top_p = params.top_p;
|
|
||||||
const float min_p = params.min_p;
|
|
||||||
const float tfs_z = params.tfs_z;
|
|
||||||
const float typical_p = params.typical_p;
|
|
||||||
const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;
|
|
||||||
|
|
||||||
for (auto sampler_type : samplers_sequence) {
|
|
||||||
switch (sampler_type) {
|
|
||||||
case llama_sampler_type::TOP_K : llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep); break;
|
|
||||||
case llama_sampler_type::TFS_Z : llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep); break;
|
|
||||||
case llama_sampler_type::TYPICAL_P: llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
|
|
||||||
case llama_sampler_type::TOP_P : llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
|
|
||||||
case llama_sampler_type::MIN_P : llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
|
|
||||||
case llama_sampler_type::TEMPERATURE:
|
|
||||||
if (dynatemp_range > 0) {
|
|
||||||
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
|
|
||||||
float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
|
|
||||||
llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
|
|
||||||
} else {
|
|
||||||
llama_sample_temp(ctx_main, &cur_p, temp);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
default : break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static llama_token llama_sampling_sample_impl(
|
|
||||||
struct llama_sampling_context * ctx_sampling,
|
|
||||||
struct llama_context * ctx_main,
|
|
||||||
struct llama_context * ctx_cfg,
|
|
||||||
const int idx,
|
|
||||||
bool is_resampling) {
|
|
||||||
const llama_sampling_params & params = ctx_sampling->params;
|
|
||||||
|
|
||||||
const float temp = params.temp;
|
|
||||||
const int mirostat = params.mirostat;
|
|
||||||
const float mirostat_tau = params.mirostat_tau;
|
|
||||||
const float mirostat_eta = params.mirostat_eta;
|
|
||||||
|
|
||||||
std::vector<float> original_logits;
|
|
||||||
auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits);
|
|
||||||
if (ctx_sampling->grammar != NULL && !is_resampling) {
|
|
||||||
GGML_ASSERT(!original_logits.empty());
|
|
||||||
}
|
|
||||||
llama_token id = 0;
|
|
||||||
|
|
||||||
if (temp < 0.0) {
|
|
||||||
// greedy sampling, with probs
|
|
||||||
llama_sample_softmax(ctx_main, &cur_p);
|
|
||||||
id = cur_p.data[0].id;
|
|
||||||
} else if (temp == 0.0) {
|
|
||||||
// greedy sampling, no probs
|
|
||||||
id = llama_sample_token_greedy(ctx_main, &cur_p);
|
|
||||||
} else {
|
|
||||||
if (mirostat == 1) {
|
|
||||||
const int mirostat_m = 100;
|
|
||||||
llama_sample_temp(ctx_main, &cur_p, temp);
|
|
||||||
id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu);
|
|
||||||
} else if (mirostat == 2) {
|
|
||||||
llama_sample_temp(ctx_main, &cur_p, temp);
|
|
||||||
id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
|
|
||||||
} else {
|
} else {
|
||||||
// temperature sampling
|
if (allow_alt_names) {
|
||||||
size_t min_keep = std::max(1, params.min_keep);
|
sampler = sampler_alt_name_map.find(name);
|
||||||
|
if (sampler != sampler_alt_name_map.end()) {
|
||||||
sampler_queue(ctx_main, params, cur_p, min_keep);
|
samplers.push_back(sampler->second);
|
||||||
|
|
||||||
id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng);
|
|
||||||
|
|
||||||
//{
|
|
||||||
// const int n_top = 10;
|
|
||||||
// LOG("top %d candidates:\n", n_top);
|
|
||||||
|
|
||||||
// for (int i = 0; i < n_top; i++) {
|
|
||||||
// const llama_token id = cur_p.data[i].id;
|
|
||||||
// (void)id; // To avoid a warning that id is unused when logging is disabled.
|
|
||||||
// LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
|
|
||||||
// }
|
|
||||||
//}
|
|
||||||
|
|
||||||
//LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ctx_sampling->grammar != NULL && !is_resampling) {
|
|
||||||
// Get a pointer to the logits
|
|
||||||
float * logits = llama_get_logits_ith(ctx_main, idx);
|
|
||||||
|
|
||||||
// Create an array with a single token data element for the sampled id
|
|
||||||
llama_token_data single_token_data = {id, logits[id], 0.0f};
|
|
||||||
llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
|
|
||||||
|
|
||||||
// Apply grammar constraints to the single token
|
|
||||||
llama_grammar_sample(ctx_sampling->grammar, ctx_main, &single_token_data_array);
|
|
||||||
|
|
||||||
// Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
|
|
||||||
bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
|
|
||||||
|
|
||||||
// If the token is not valid according to the grammar, perform resampling
|
|
||||||
if (!is_valid) {
|
|
||||||
LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str());
|
|
||||||
|
|
||||||
// Restore logits from the copy
|
|
||||||
std::copy(original_logits.begin(), original_logits.end(), logits);
|
|
||||||
|
|
||||||
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size;
|
|
||||||
|
|
||||||
return id;
|
|
||||||
}
|
|
||||||
|
|
||||||
static llama_token_data_array llama_sampling_prepare_impl(
|
|
||||||
struct llama_sampling_context * ctx_sampling,
|
|
||||||
struct llama_context * ctx_main,
|
|
||||||
struct llama_context * ctx_cfg,
|
|
||||||
const int idx,
|
|
||||||
bool apply_grammar,
|
|
||||||
std::vector<float> * original_logits) {
|
|
||||||
const llama_sampling_params & params = ctx_sampling->params;
|
|
||||||
|
|
||||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
|
||||||
|
|
||||||
const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
|
|
||||||
const float penalty_repeat = params.penalty_repeat;
|
|
||||||
const float penalty_freq = params.penalty_freq;
|
|
||||||
const float penalty_present = params.penalty_present;
|
|
||||||
|
|
||||||
const bool penalize_nl = params.penalize_nl;
|
|
||||||
|
|
||||||
auto & prev = ctx_sampling->prev;
|
|
||||||
auto & cur = ctx_sampling->cur;
|
|
||||||
|
|
||||||
// Get a pointer to the logits
|
|
||||||
float * logits = llama_get_logits_ith(ctx_main, idx);
|
|
||||||
|
|
||||||
if (ctx_sampling->grammar != NULL && !apply_grammar) {
|
|
||||||
GGML_ASSERT(original_logits != NULL);
|
|
||||||
// Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
|
|
||||||
*original_logits = {logits, logits + n_vocab};
|
|
||||||
}
|
|
||||||
|
|
||||||
// apply params.logit_bias map
|
|
||||||
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
|
||||||
logits[it->first] += it->second;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ctx_cfg) {
|
|
||||||
float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
|
|
||||||
llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
|
|
||||||
}
|
|
||||||
|
|
||||||
cur.resize(n_vocab);
|
|
||||||
|
|
||||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
|
||||||
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_token_data_array cur_p = { cur.data(), cur.size(), false };
|
|
||||||
|
|
||||||
// apply penalties
|
|
||||||
const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
|
|
||||||
const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
|
|
||||||
if (penalty_tokens_used_size) {
|
|
||||||
const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
|
|
||||||
|
|
||||||
llama_sample_repetition_penalties(ctx_main, &cur_p,
|
|
||||||
penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
|
|
||||||
penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
|
|
||||||
|
|
||||||
if (!penalize_nl) {
|
|
||||||
for (size_t idx = 0; idx < cur_p.size; idx++) {
|
|
||||||
if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
|
|
||||||
cur_p.data[idx].logit = nl_logit;
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// apply grammar checks before sampling logic
|
return samplers;
|
||||||
if (apply_grammar && ctx_sampling->grammar != NULL) {
|
}
|
||||||
llama_grammar_sample(ctx_sampling->grammar, ctx_main, &cur_p);
|
|
||||||
|
std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
|
||||||
|
std::unordered_map<char, gpt_sampler_type> sampler_name_map {
|
||||||
|
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K },
|
||||||
|
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z },
|
||||||
|
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P },
|
||||||
|
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P), GPT_SAMPLER_TYPE_TOP_P },
|
||||||
|
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P), GPT_SAMPLER_TYPE_MIN_P },
|
||||||
|
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE }
|
||||||
|
};
|
||||||
|
|
||||||
|
std::vector<gpt_sampler_type> samplers;
|
||||||
|
samplers.reserve(chars.size());
|
||||||
|
|
||||||
|
for (const auto & c : chars) {
|
||||||
|
const auto sampler = sampler_name_map.find(c);
|
||||||
|
if (sampler != sampler_name_map.end()) {
|
||||||
|
samplers.push_back(sampler->second);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return cur_p;
|
return samplers;
|
||||||
}
|
|
||||||
|
|
||||||
llama_token llama_sampling_sample(
|
|
||||||
struct llama_sampling_context * ctx_sampling,
|
|
||||||
struct llama_context * ctx_main,
|
|
||||||
struct llama_context * ctx_cfg,
|
|
||||||
const int idx) {
|
|
||||||
// Call the implementation function with is_resampling set to false by default
|
|
||||||
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false);
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_token_data_array llama_sampling_prepare(
|
|
||||||
struct llama_sampling_context * ctx_sampling,
|
|
||||||
struct llama_context * ctx_main,
|
|
||||||
struct llama_context * ctx_cfg,
|
|
||||||
const int idx,
|
|
||||||
bool apply_grammar,
|
|
||||||
std::vector<float> * original_logits) {
|
|
||||||
return llama_sampling_prepare_impl(ctx_sampling,ctx_main, ctx_cfg, idx, apply_grammar, original_logits);
|
|
||||||
}
|
|
||||||
|
|
||||||
void llama_sampling_accept(
|
|
||||||
struct llama_sampling_context * ctx_sampling,
|
|
||||||
struct llama_context * ctx_main,
|
|
||||||
llama_token id,
|
|
||||||
bool apply_grammar) {
|
|
||||||
ctx_sampling->prev.erase(ctx_sampling->prev.begin());
|
|
||||||
ctx_sampling->prev.push_back(id);
|
|
||||||
|
|
||||||
if (ctx_sampling->grammar != NULL && apply_grammar) {
|
|
||||||
llama_grammar_accept_token(ctx_sampling->grammar, ctx_main, id);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -2,159 +2,130 @@
|
|||||||
|
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include "grammar-parser.h"
|
|
||||||
|
|
||||||
#include <random>
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <unordered_map>
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
// sampler types
|
enum gpt_sampler_type {
|
||||||
enum class llama_sampler_type : char {
|
GPT_SAMPLER_TYPE_NONE = 0,
|
||||||
TOP_K = 'k',
|
GPT_SAMPLER_TYPE_TOP_K = 1,
|
||||||
TOP_P = 'p',
|
GPT_SAMPLER_TYPE_TOP_P = 2,
|
||||||
MIN_P = 'm',
|
GPT_SAMPLER_TYPE_MIN_P = 3,
|
||||||
TFS_Z = 'f',
|
GPT_SAMPLER_TYPE_TFS_Z = 4,
|
||||||
TYPICAL_P = 'y',
|
GPT_SAMPLER_TYPE_TYPICAL_P = 5,
|
||||||
TEMPERATURE = 't'
|
GPT_SAMPLER_TYPE_TEMPERATURE = 6,
|
||||||
};
|
};
|
||||||
|
|
||||||
// sampling parameters
|
// sampling parameters
|
||||||
typedef struct llama_sampling_params {
|
struct gpt_sampler_params {
|
||||||
int32_t n_prev = 64; // number of previous tokens to remember
|
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
||||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
|
||||||
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
|
|
||||||
int32_t top_k = 40; // <= 0 to use vocab size
|
|
||||||
float top_p = 0.95f; // 1.0 = disabled
|
|
||||||
float min_p = 0.05f; // 0.0 = disabled
|
|
||||||
float tfs_z = 1.00f; // 1.0 = disabled
|
|
||||||
float typical_p = 1.00f; // 1.0 = disabled
|
|
||||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
|
||||||
float dynatemp_range = 0.00f; // 0.0 = disabled
|
|
||||||
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
|
||||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
|
||||||
float penalty_repeat = 1.00f; // 1.0 = disabled
|
|
||||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
|
||||||
float penalty_present = 0.00f; // 0.0 = disabled
|
|
||||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
|
||||||
float mirostat_tau = 5.00f; // target entropy
|
|
||||||
float mirostat_eta = 0.10f; // learning rate
|
|
||||||
bool penalize_nl = false; // consider newlines as a repeatable token
|
|
||||||
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
|
|
||||||
|
|
||||||
std::vector<llama_sampler_type> samplers_sequence = {
|
int32_t n_prev = 64; // number of previous tokens to remember
|
||||||
llama_sampler_type::TOP_K,
|
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||||
llama_sampler_type::TFS_Z,
|
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
|
||||||
llama_sampler_type::TYPICAL_P,
|
int32_t top_k = 40; // <= 0 to use vocab size
|
||||||
llama_sampler_type::TOP_P,
|
float top_p = 0.95f; // 1.0 = disabled
|
||||||
llama_sampler_type::MIN_P,
|
float min_p = 0.05f; // 0.0 = disabled
|
||||||
llama_sampler_type::TEMPERATURE
|
float tfs_z = 1.00f; // 1.0 = disabled
|
||||||
|
float typ_p = 1.00f; // typical_p, 1.0 = disabled
|
||||||
|
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||||
|
float dynatemp_range = 0.00f; // 0.0 = disabled
|
||||||
|
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
||||||
|
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||||
|
float penalty_repeat = 1.00f; // 1.0 = disabled
|
||||||
|
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||||
|
float penalty_present = 0.00f; // 0.0 = disabled
|
||||||
|
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||||
|
float mirostat_tau = 5.00f; // target entropy
|
||||||
|
float mirostat_eta = 0.10f; // learning rate
|
||||||
|
bool penalize_nl = false; // consider newlines as a repeatable token
|
||||||
|
bool ignore_eos = false;
|
||||||
|
|
||||||
|
std::vector<enum gpt_sampler_type> samplers = {
|
||||||
|
GPT_SAMPLER_TYPE_TOP_K,
|
||||||
|
GPT_SAMPLER_TYPE_TFS_Z,
|
||||||
|
GPT_SAMPLER_TYPE_TYPICAL_P,
|
||||||
|
GPT_SAMPLER_TYPE_TOP_P,
|
||||||
|
GPT_SAMPLER_TYPE_MIN_P,
|
||||||
|
GPT_SAMPLER_TYPE_TEMPERATURE
|
||||||
};
|
};
|
||||||
|
|
||||||
std::string grammar; // optional BNF-like grammar to constrain sampling
|
std::string grammar; // optional BNF-like grammar to constrain sampling
|
||||||
|
|
||||||
// Classifier-Free Guidance
|
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
|
||||||
// https://arxiv.org/abs/2306.17806
|
|
||||||
std::string cfg_negative_prompt; // string to help guidance
|
|
||||||
float cfg_scale = 1.f; // how strong is guidance
|
|
||||||
|
|
||||||
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
|
// print the parameters into a string
|
||||||
|
std::string print() const;
|
||||||
std::vector<llama_token> penalty_prompt_tokens;
|
|
||||||
bool use_penalty_prompt_tokens = false;
|
|
||||||
} llama_sampling_params;
|
|
||||||
|
|
||||||
// general sampler context
|
|
||||||
// TODO: move to llama.h
|
|
||||||
struct llama_sampling_context {
|
|
||||||
// parameters that will be used for sampling
|
|
||||||
llama_sampling_params params;
|
|
||||||
|
|
||||||
// mirostat sampler state
|
|
||||||
float mirostat_mu;
|
|
||||||
|
|
||||||
llama_grammar * grammar;
|
|
||||||
|
|
||||||
// internal
|
|
||||||
grammar_parser::parse_state parsed_grammar;
|
|
||||||
|
|
||||||
// TODO: replace with ring-buffer
|
|
||||||
std::vector<llama_token> prev;
|
|
||||||
std::vector<llama_token_data> cur;
|
|
||||||
size_t n_valid; // Number of correct top tokens with correct probabilities.
|
|
||||||
|
|
||||||
std::mt19937 rng;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
#include "common.h"
|
// gpt_sampler extends llama_sampler with additional functionality:
|
||||||
|
|
||||||
// Create a new sampling context instance.
|
|
||||||
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);
|
|
||||||
|
|
||||||
void llama_sampling_free(struct llama_sampling_context * ctx);
|
|
||||||
|
|
||||||
// Reset the sampler context
|
|
||||||
// - clear prev tokens
|
|
||||||
// - reset grammar
|
|
||||||
void llama_sampling_reset(llama_sampling_context * ctx);
|
|
||||||
|
|
||||||
// Set the sampler seed
|
|
||||||
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
|
|
||||||
|
|
||||||
// Copy the sampler context
|
|
||||||
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
|
|
||||||
|
|
||||||
// Get the last sampled token
|
|
||||||
llama_token llama_sampling_last(llama_sampling_context * ctx);
|
|
||||||
|
|
||||||
// Get a string representation of the last sampled tokens
|
|
||||||
std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);
|
|
||||||
|
|
||||||
// Print sampling parameters into a string
|
|
||||||
std::string llama_sampling_print(const llama_sampling_params & params);
|
|
||||||
|
|
||||||
// Print sampling order into a string
|
|
||||||
std::string llama_sampling_order_print(const llama_sampling_params & params);
|
|
||||||
|
|
||||||
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
|
|
||||||
|
|
||||||
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
|
|
||||||
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
|
|
||||||
|
|
||||||
// this is a common sampling function used across the examples for convenience
|
|
||||||
// it can serve as a starting point for implementing your own sampling function
|
|
||||||
// Note: When using multiple sequences, it is the caller's responsibility to call
|
|
||||||
// llama_sampling_reset when a sequence ends
|
|
||||||
//
|
//
|
||||||
// required:
|
// - grammar support
|
||||||
// - ctx_main: context to use for sampling
|
// - custom sampler logic based on the parameters
|
||||||
// - ctx_sampling: sampling-specific context
|
// - history of the last accepted tokens
|
||||||
|
// - performance metrics
|
||||||
//
|
//
|
||||||
// optional:
|
// This goal is to have a common implementation of the sampling logic shared across the examples.
|
||||||
// - ctx_cfg: context to use for classifier-free guidance
|
// For example, depending on the temperature, the sampling chain can be very simple (greedy) or more
|
||||||
// - idx: sample from llama_get_logits_ith(ctx, idx)
|
// complex (top-k, top-p, etc).
|
||||||
//
|
//
|
||||||
// returns:
|
// Another example is related to the grammar. In general, the grammar constraints applied on the full
|
||||||
// - token: sampled token
|
// vocabulary can be very taxing. To improve performance, the grammar can be applied only to the sampled
|
||||||
// - candidates: vector of candidate tokens
|
// token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
|
||||||
|
// grammar constraints are applied to the full vocabulary and the token is resampled.
|
||||||
|
//
|
||||||
|
// The gpt_sampler also maintains a container with the last accepted tokens. In the future, this can
|
||||||
|
// be moved into the core llama library.
|
||||||
|
//
|
||||||
|
// For convenience, the gpt_sampler also maintains a container with the current candidate tokens.
|
||||||
|
// This can be used to access the probabilities of the rest of the non-sampled tokens.
|
||||||
|
//
|
||||||
|
// TODO: measure grammar performance
|
||||||
//
|
//
|
||||||
llama_token llama_sampling_sample(
|
|
||||||
struct llama_sampling_context * ctx_sampling,
|
|
||||||
struct llama_context * ctx_main,
|
|
||||||
struct llama_context * ctx_cfg,
|
|
||||||
int idx = -1);
|
|
||||||
|
|
||||||
// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
|
struct gpt_sampler;
|
||||||
llama_token_data_array llama_sampling_prepare(
|
|
||||||
struct llama_sampling_context * ctx_sampling,
|
|
||||||
struct llama_context * ctx_main,
|
|
||||||
struct llama_context * ctx_cfg,
|
|
||||||
int idx = 0,
|
|
||||||
bool apply_grammar = true,
|
|
||||||
std::vector<float> * original_logits = nullptr);
|
|
||||||
|
|
||||||
void llama_sampling_accept(
|
// llama_sampler API overloads
|
||||||
struct llama_sampling_context * ctx_sampling,
|
|
||||||
struct llama_context * ctx_main,
|
struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params);
|
||||||
llama_token id,
|
|
||||||
bool apply_grammar);
|
void gpt_sampler_free(struct gpt_sampler * gsmpl);
|
||||||
|
|
||||||
|
// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
|
||||||
|
void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar);
|
||||||
|
void gpt_sampler_reset (struct gpt_sampler * gsmpl);
|
||||||
|
struct gpt_sampler * gpt_sampler_clone (struct gpt_sampler * gsmpl);
|
||||||
|
|
||||||
|
// arguments can be nullptr to skip printing
|
||||||
|
void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);
|
||||||
|
|
||||||
|
// extended sampling implementation:
|
||||||
|
//
|
||||||
|
// - set logits
|
||||||
|
// - apply the configured sampler chain
|
||||||
|
// - check if the token fits the grammar (if any)
|
||||||
|
// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
|
||||||
|
//
|
||||||
|
// if grammar_first is true, the grammar is applied before the samplers (slower)
|
||||||
|
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
|
||||||
|
//
|
||||||
|
llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
|
||||||
|
|
||||||
|
// helpers
|
||||||
|
|
||||||
|
// access the internal list of current candidate tokens
|
||||||
|
llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl);
|
||||||
|
|
||||||
|
// get the last accepted token
|
||||||
|
llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl);
|
||||||
|
|
||||||
|
// print the sampler chain into a string
|
||||||
|
std::string gpt_sampler_print(const struct gpt_sampler * gsmpl);
|
||||||
|
|
||||||
|
// get a string representation of the last accepted tokens
|
||||||
|
std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n);
|
||||||
|
|
||||||
|
char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr);
|
||||||
|
std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr);
|
||||||
|
|
||||||
|
std::vector<enum gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
|
||||||
|
std::vector<enum gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars);
|
||||||
|
11662
common/stb_image.h
11662
common/stb_image.h
File diff suppressed because it is too large
Load Diff
@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import ast
|
||||||
import logging
|
import logging
|
||||||
import argparse
|
import argparse
|
||||||
import contextlib
|
import contextlib
|
||||||
@ -63,6 +64,7 @@ class Model:
|
|||||||
model_name: str | None
|
model_name: str | None
|
||||||
metadata_override: Path | None
|
metadata_override: Path | None
|
||||||
dir_model_card: Path
|
dir_model_card: Path
|
||||||
|
is_lora: bool
|
||||||
|
|
||||||
# subclasses should define this!
|
# subclasses should define this!
|
||||||
model_arch: gguf.MODEL_ARCH
|
model_arch: gguf.MODEL_ARCH
|
||||||
@ -70,7 +72,7 @@ class Model:
|
|||||||
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
|
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
|
||||||
use_temp_file: bool = False, eager: bool = False,
|
use_temp_file: bool = False, eager: bool = False,
|
||||||
metadata_override: Path | None = None, model_name: str | None = None,
|
metadata_override: Path | None = None, model_name: str | None = None,
|
||||||
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
|
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, is_lora: bool = False):
|
||||||
if type(self) is Model:
|
if type(self) is Model:
|
||||||
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
|
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
|
||||||
|
|
||||||
@ -92,6 +94,7 @@ class Model:
|
|||||||
self.metadata_override = metadata_override
|
self.metadata_override = metadata_override
|
||||||
self.model_name = model_name
|
self.model_name = model_name
|
||||||
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
|
self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py
|
||||||
|
self.is_lora = is_lora # true if model is used inside convert_lora_to_gguf.py
|
||||||
|
|
||||||
# Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
|
# Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
|
||||||
if self.ftype == gguf.LlamaFileType.GUESSED:
|
if self.ftype == gguf.LlamaFileType.GUESSED:
|
||||||
@ -295,12 +298,30 @@ class Model:
|
|||||||
gguf.MODEL_TENSOR.FFN_GATE_INP,
|
gguf.MODEL_TENSOR.FFN_GATE_INP,
|
||||||
gguf.MODEL_TENSOR.POS_EMBD,
|
gguf.MODEL_TENSOR.POS_EMBD,
|
||||||
gguf.MODEL_TENSOR.TOKEN_TYPES,
|
gguf.MODEL_TENSOR.TOKEN_TYPES,
|
||||||
|
gguf.MODEL_TENSOR.SSM_CONV1D,
|
||||||
|
gguf.MODEL_TENSOR.TIME_MIX_FIRST,
|
||||||
|
gguf.MODEL_TENSOR.TIME_MIX_W1,
|
||||||
|
gguf.MODEL_TENSOR.TIME_MIX_W2,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
or not name.endswith(".weight")
|
or not new_name.endswith(".weight")
|
||||||
):
|
):
|
||||||
data_qtype = gguf.GGMLQuantizationType.F32
|
data_qtype = gguf.GGMLQuantizationType.F32
|
||||||
|
|
||||||
|
if data_qtype is False and any(
|
||||||
|
self.match_model_tensor_name(new_name, key, bid)
|
||||||
|
for key in (
|
||||||
|
gguf.MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
gguf.MODEL_TENSOR.OUTPUT,
|
||||||
|
)
|
||||||
|
):
|
||||||
|
if self.ftype in (
|
||||||
|
gguf.LlamaFileType.MOSTLY_TQ1_0,
|
||||||
|
gguf.LlamaFileType.MOSTLY_TQ2_0,
|
||||||
|
):
|
||||||
|
# TODO: use Q4_K and Q6_K
|
||||||
|
data_qtype = gguf.GGMLQuantizationType.F16
|
||||||
|
|
||||||
# No override (data_qtype is False), or wants to be quantized (data_qtype is True)
|
# No override (data_qtype is False), or wants to be quantized (data_qtype is True)
|
||||||
if isinstance(data_qtype, bool):
|
if isinstance(data_qtype, bool):
|
||||||
if self.ftype == gguf.LlamaFileType.ALL_F32:
|
if self.ftype == gguf.LlamaFileType.ALL_F32:
|
||||||
@ -311,6 +332,10 @@ class Model:
|
|||||||
data_qtype = gguf.GGMLQuantizationType.BF16
|
data_qtype = gguf.GGMLQuantizationType.BF16
|
||||||
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
|
elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
|
||||||
data_qtype = gguf.GGMLQuantizationType.Q8_0
|
data_qtype = gguf.GGMLQuantizationType.Q8_0
|
||||||
|
elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
|
||||||
|
data_qtype = gguf.GGMLQuantizationType.TQ1_0
|
||||||
|
elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
|
||||||
|
data_qtype = gguf.GGMLQuantizationType.TQ2_0
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unknown file type: {self.ftype.name}")
|
raise ValueError(f"Unknown file type: {self.ftype.name}")
|
||||||
|
|
||||||
@ -1569,7 +1594,7 @@ class LlamaModel(Model):
|
|||||||
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
||||||
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
||||||
base = self.hparams.get("rope_theta", 10000.0)
|
base = self.hparams.get("rope_theta", 10000.0)
|
||||||
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
||||||
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
||||||
|
|
||||||
factor = rope_scaling.get("factor", 8.0)
|
factor = rope_scaling.get("factor", 8.0)
|
||||||
@ -1592,7 +1617,8 @@ class LlamaModel(Model):
|
|||||||
smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
|
smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
|
||||||
rope_factors.append(1 / ((1 - smooth) / factor + smooth))
|
rope_factors.append(1 / ((1 - smooth) / factor + smooth))
|
||||||
|
|
||||||
self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
|
if not self.is_lora:
|
||||||
|
self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
|
||||||
|
|
||||||
super().prepare_tensors()
|
super().prepare_tensors()
|
||||||
|
|
||||||
@ -1615,15 +1641,16 @@ class BitnetModel(Model):
|
|||||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||||
self.gguf_writer.add_rope_scaling_factor(1.0)
|
self.gguf_writer.add_rope_scaling_factor(1.0)
|
||||||
|
|
||||||
def weight_quant(self, weight):
|
def weight_quant(self, weight: Tensor) -> Tensor:
|
||||||
dtype = weight.dtype
|
dtype = weight.dtype
|
||||||
weight = weight.float()
|
weight = weight.float()
|
||||||
s = 1 / weight.abs().mean().clamp(min=1e-5)
|
scale = weight.abs().mean().clamp(min=1e-5)
|
||||||
weight = (weight * s).round().clamp(-1, 1) / s
|
iscale = 1 / scale
|
||||||
scale = weight.abs().max().unsqueeze(0)
|
# TODO: multiply by the scale directly instead of inverting it twice
|
||||||
weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
|
# (this is also unnecessarily doubly inverted upstream)
|
||||||
weight = torch.sign(weight).type(dtype)
|
# ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10
|
||||||
return weight.type(dtype), scale.type(torch.float32)
|
result = (weight * iscale).round().clamp(-1, 1) / iscale
|
||||||
|
return result.type(dtype)
|
||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
new_name = self.map_tensor_name(name)
|
new_name = self.map_tensor_name(name)
|
||||||
@ -1638,11 +1665,9 @@ class BitnetModel(Model):
|
|||||||
gguf.MODEL_TENSOR.FFN_GATE,
|
gguf.MODEL_TENSOR.FFN_GATE,
|
||||||
]):
|
]):
|
||||||
# transform weight into 1/0/-1 (in fp32)
|
# transform weight into 1/0/-1 (in fp32)
|
||||||
weight_torch, scale_torch = self.weight_quant(data_torch)
|
data_torch = self.weight_quant(data_torch)
|
||||||
yield (new_name, weight_torch)
|
|
||||||
yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
|
yield (new_name, data_torch)
|
||||||
else:
|
|
||||||
yield (new_name, data_torch)
|
|
||||||
|
|
||||||
|
|
||||||
@Model.register("GrokForCausalLM")
|
@Model.register("GrokForCausalLM")
|
||||||
@ -2139,8 +2164,9 @@ class Phi3MiniModel(Model):
|
|||||||
if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
|
if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
|
||||||
raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
|
raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
|
||||||
|
|
||||||
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
|
if not self.is_lora:
|
||||||
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
|
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
|
||||||
|
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
|
||||||
|
|
||||||
|
|
||||||
@Model.register("PlamoForCausalLM")
|
@Model.register("PlamoForCausalLM")
|
||||||
@ -2711,7 +2737,85 @@ class StarCoder2Model(Model):
|
|||||||
model_arch = gguf.MODEL_ARCH.STARCODER2
|
model_arch = gguf.MODEL_ARCH.STARCODER2
|
||||||
|
|
||||||
|
|
||||||
@Model.register("MambaForCausalLM", "MambaLMHeadModel")
|
@Model.register("Rwkv6ForCausalLM")
|
||||||
|
class Rwkv6Model(Model):
|
||||||
|
model_arch = gguf.MODEL_ARCH.RWKV6
|
||||||
|
|
||||||
|
def set_vocab(self):
|
||||||
|
assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
|
||||||
|
vocab_size = self.hparams.get("vocab_size", 65536)
|
||||||
|
|
||||||
|
tokens: list[bytes] = ['<s>'.encode("utf-8")]
|
||||||
|
toktypes: list[int] = [gguf.TokenType.CONTROL]
|
||||||
|
|
||||||
|
with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
|
||||||
|
lines = f.readlines()
|
||||||
|
for line in lines:
|
||||||
|
parts = line.split(' ')
|
||||||
|
assert len(parts) >= 3
|
||||||
|
token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
|
||||||
|
token = token.encode("utf-8") if isinstance(token, str) else token
|
||||||
|
assert isinstance(token, bytes)
|
||||||
|
assert len(token) == token_len
|
||||||
|
token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff"
|
||||||
|
tokens.append(token_text.encode("utf-8"))
|
||||||
|
toktypes.append(gguf.TokenType.NORMAL)
|
||||||
|
remainder = vocab_size - len(tokens)
|
||||||
|
assert remainder >= 0
|
||||||
|
for i in range(len(tokens), vocab_size):
|
||||||
|
tokens.append(f"[PAD{i}]".encode("utf-8"))
|
||||||
|
toktypes.append(gguf.TokenType.UNUSED)
|
||||||
|
|
||||||
|
self.gguf_writer.add_tokenizer_model("rwkv")
|
||||||
|
self.gguf_writer.add_token_list(tokens)
|
||||||
|
self.gguf_writer.add_token_types(toktypes)
|
||||||
|
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
block_count = self.hparams["num_hidden_layers"]
|
||||||
|
head_size = self.hparams["head_size"]
|
||||||
|
hidden_size = self.hparams["hidden_size"]
|
||||||
|
layer_norm_eps = self.hparams["layer_norm_epsilon"]
|
||||||
|
rescale_every_n_layers = self.hparams["rescale_every"]
|
||||||
|
intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)
|
||||||
|
time_mix_extra_dim = 64 if hidden_size == 4096 else 32
|
||||||
|
time_decay_extra_dim = 128 if hidden_size == 4096 else 64
|
||||||
|
|
||||||
|
# RWKV isn't context limited
|
||||||
|
self.gguf_writer.add_context_length(1048576)
|
||||||
|
self.gguf_writer.add_embedding_length(hidden_size)
|
||||||
|
self.gguf_writer.add_block_count(block_count)
|
||||||
|
self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
|
||||||
|
self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
|
||||||
|
self.gguf_writer.add_wkv_head_size(head_size)
|
||||||
|
self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
|
||||||
|
self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
|
||||||
|
self.gguf_writer.add_feed_forward_length(intermediate_size)
|
||||||
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
|
||||||
|
# required by llama.cpp, unused
|
||||||
|
self.gguf_writer.add_head_count(0)
|
||||||
|
|
||||||
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
|
new_name = self.map_tensor_name(name)
|
||||||
|
|
||||||
|
if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
|
||||||
|
new_name += ".weight"
|
||||||
|
|
||||||
|
if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"):
|
||||||
|
data_torch = data_torch.transpose(0, 1)
|
||||||
|
|
||||||
|
if new_name.endswith("time_mix_w2.weight"):
|
||||||
|
data_torch = data_torch.permute(0, 2, 1)
|
||||||
|
|
||||||
|
rescale_every_n_layers = self.hparams["rescale_every"]
|
||||||
|
if rescale_every_n_layers > 0:
|
||||||
|
if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
|
||||||
|
data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
|
||||||
|
|
||||||
|
yield (new_name, data_torch)
|
||||||
|
|
||||||
|
|
||||||
|
@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
|
||||||
class MambaModel(Model):
|
class MambaModel(Model):
|
||||||
model_arch = gguf.MODEL_ARCH.MAMBA
|
model_arch = gguf.MODEL_ARCH.MAMBA
|
||||||
|
|
||||||
@ -2742,7 +2846,10 @@ class MambaModel(Model):
|
|||||||
# ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
|
# ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
|
||||||
dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
|
dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
|
||||||
rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
|
rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
|
||||||
|
use_dt_b_c_norm = False
|
||||||
|
# For falconmamba we do apply RMS norm on B / DT and C layers
|
||||||
|
if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",):
|
||||||
|
use_dt_b_c_norm = True
|
||||||
# Fail early for models which don't have a block expansion factor of 2
|
# Fail early for models which don't have a block expansion factor of 2
|
||||||
assert d_inner == 2 * d_model
|
assert d_inner == 2 * d_model
|
||||||
|
|
||||||
@ -2750,12 +2857,13 @@ class MambaModel(Model):
|
|||||||
self.gguf_writer.add_embedding_length(d_model)
|
self.gguf_writer.add_embedding_length(d_model)
|
||||||
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
|
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
|
||||||
self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
|
self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
|
||||||
self.gguf_writer.add_block_count(self.hparams["n_layer"])
|
self.gguf_writer.add_block_count(self.block_count)
|
||||||
self.gguf_writer.add_ssm_conv_kernel(d_conv)
|
self.gguf_writer.add_ssm_conv_kernel(d_conv)
|
||||||
self.gguf_writer.add_ssm_inner_size(d_inner)
|
self.gguf_writer.add_ssm_inner_size(d_inner)
|
||||||
self.gguf_writer.add_ssm_state_size(d_state)
|
self.gguf_writer.add_ssm_state_size(d_state)
|
||||||
self.gguf_writer.add_ssm_time_step_rank(dt_rank)
|
self.gguf_writer.add_ssm_time_step_rank(dt_rank)
|
||||||
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
|
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
|
||||||
|
self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) # For classic Mamba we don't apply rms norm on B / DT layers
|
||||||
self.gguf_writer.add_file_type(self.ftype)
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
|
||||||
_tok_embd = None
|
_tok_embd = None
|
||||||
@ -2782,23 +2890,6 @@ class MambaModel(Model):
|
|||||||
|
|
||||||
return [(new_name, data_torch)]
|
return [(new_name, data_torch)]
|
||||||
|
|
||||||
def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
|
|
||||||
if bid is not None and new_name in (
|
|
||||||
self.format_tensor_name(
|
|
||||||
n, bid, ".weight" if name.endswith(".weight") else ""
|
|
||||||
)
|
|
||||||
for n in [
|
|
||||||
gguf.MODEL_TENSOR.SSM_CONV1D,
|
|
||||||
gguf.MODEL_TENSOR.SSM_X,
|
|
||||||
gguf.MODEL_TENSOR.SSM_DT,
|
|
||||||
gguf.MODEL_TENSOR.SSM_A,
|
|
||||||
gguf.MODEL_TENSOR.SSM_D,
|
|
||||||
]
|
|
||||||
):
|
|
||||||
return gguf.GGMLQuantizationType.F32
|
|
||||||
|
|
||||||
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
|
||||||
|
|
||||||
|
|
||||||
@Model.register("CohereForCausalLM")
|
@Model.register("CohereForCausalLM")
|
||||||
class CommandR2Model(Model):
|
class CommandR2Model(Model):
|
||||||
@ -3792,7 +3883,7 @@ class ExaoneModel(Model):
|
|||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
hparams = self.hparams
|
hparams = self.hparams
|
||||||
|
|
||||||
assert(hparams["activation_function"] == "silu")
|
assert (hparams["activation_function"] == "silu")
|
||||||
|
|
||||||
max_position_embeddings = hparams["max_position_embeddings"]
|
max_position_embeddings = hparams["max_position_embeddings"]
|
||||||
embed_dim = hparams["hidden_size"]
|
embed_dim = hparams["hidden_size"]
|
||||||
@ -3828,7 +3919,7 @@ class ExaoneModel(Model):
|
|||||||
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
|
||||||
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
if rope_scaling.get("rope_type", '').lower() == "llama3":
|
||||||
base = self.hparams.get("rope_theta", 10000.0)
|
base = self.hparams.get("rope_theta", 10000.0)
|
||||||
dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
|
||||||
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
|
||||||
|
|
||||||
factor = rope_scaling.get("factor", 8.0)
|
factor = rope_scaling.get("factor", 8.0)
|
||||||
@ -3851,12 +3942,13 @@ class ExaoneModel(Model):
|
|||||||
smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
|
smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
|
||||||
rope_factors.append(1 / ((1 - smooth) / factor + smooth))
|
rope_factors.append(1 / ((1 - smooth) / factor + smooth))
|
||||||
|
|
||||||
self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
|
if not self.is_lora:
|
||||||
|
self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
|
||||||
|
|
||||||
super().prepare_tensors()
|
super().prepare_tensors()
|
||||||
|
|
||||||
###### CONVERSION LOGIC ######
|
|
||||||
|
|
||||||
|
###### CONVERSION LOGIC ######
|
||||||
|
|
||||||
# tree of lazy tensors
|
# tree of lazy tensors
|
||||||
class LazyTorchTensor(gguf.LazyBase):
|
class LazyTorchTensor(gguf.LazyBase):
|
||||||
@ -3936,8 +4028,8 @@ def parse_args() -> argparse.Namespace:
|
|||||||
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
|
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
|
"--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
|
||||||
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
|
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--bigendian", action="store_true",
|
"--bigendian", action="store_true",
|
||||||
@ -4024,6 +4116,8 @@ def main() -> None:
|
|||||||
"f16": gguf.LlamaFileType.MOSTLY_F16,
|
"f16": gguf.LlamaFileType.MOSTLY_F16,
|
||||||
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
|
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
|
||||||
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
|
"q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
|
||||||
|
"tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
|
||||||
|
"tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
|
||||||
"auto": gguf.LlamaFileType.GUESSED,
|
"auto": gguf.LlamaFileType.GUESSED,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -386,6 +386,7 @@ if __name__ == '__main__':
|
|||||||
dry_run=args.dry_run,
|
dry_run=args.dry_run,
|
||||||
dir_lora_model=dir_lora,
|
dir_lora_model=dir_lora,
|
||||||
lora_alpha=alpha,
|
lora_alpha=alpha,
|
||||||
|
is_lora=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
logger.info("Exporting model...")
|
logger.info("Exporting model...")
|
||||||
|
@ -20,7 +20,7 @@
|
|||||||
**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to Intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
|
**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to Intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
|
||||||
|
|
||||||
- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
|
- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
|
||||||
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL - Math Kernel Library)*.
|
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*.
|
||||||
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
|
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
|
||||||
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
|
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
|
||||||
|
|
||||||
@ -28,10 +28,6 @@
|
|||||||
|
|
||||||
The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*).
|
The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*).
|
||||||
|
|
||||||
When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.
|
|
||||||
|
|
||||||
It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
|
|
||||||
|
|
||||||
## Recommended Release
|
## Recommended Release
|
||||||
|
|
||||||
The SYCL backend may be broken by some PRs because there is no online CI for it.
|
The SYCL backend may be broken by some PRs because there is no online CI for it.
|
||||||
@ -45,6 +41,10 @@ The following release is verified with good quality:
|
|||||||
|
|
||||||
## News
|
## News
|
||||||
|
|
||||||
|
|
||||||
|
- 2024.8
|
||||||
|
- Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
|
||||||
|
|
||||||
- 2024.5
|
- 2024.5
|
||||||
- Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
|
- Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
|
||||||
- Arch Linux is verified successfully.
|
- Arch Linux is verified successfully.
|
||||||
@ -196,7 +196,7 @@ Please follow the instructions for downloading and installing the Toolkit for Li
|
|||||||
|
|
||||||
Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
|
Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
|
||||||
|
|
||||||
Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI MKL for intel GPUs.
|
Upon a successful installation, SYCL is enabled for the available Intel devices, along with relevant libraries such as oneAPI oneDNN for Intel GPUs.
|
||||||
|
|
||||||
- **Adding support to Nvidia GPUs**
|
- **Adding support to Nvidia GPUs**
|
||||||
|
|
||||||
@ -255,8 +255,6 @@ or
|
|||||||
# Export relevant ENV variables
|
# Export relevant ENV variables
|
||||||
source /opt/intel/oneapi/setvars.sh
|
source /opt/intel/oneapi/setvars.sh
|
||||||
|
|
||||||
# Build LLAMA with MKL BLAS acceleration for intel GPU
|
|
||||||
|
|
||||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||||
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||||
|
|
||||||
@ -338,12 +336,12 @@ Choose one of following methods to run.
|
|||||||
- Use device 0:
|
- Use device 0:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
./examples/sycl/run_llama2.sh 0
|
./examples/sycl/run-llama2.sh 0
|
||||||
```
|
```
|
||||||
- Use multiple devices:
|
- Use multiple devices:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
./examples/sycl/run_llama2.sh
|
./examples/sycl/run-llama2.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Command line
|
2. Command line
|
||||||
|
@ -20,7 +20,7 @@ Additionally, there the following images, similar to the above:
|
|||||||
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||||
- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||||
|
|
||||||
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
|
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library), you'll need to build the images locally for now.
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
@ -66,8 +66,8 @@ You may want to pass in some different `ARGS`, depending on the CUDA environment
|
|||||||
|
|
||||||
The defaults are:
|
The defaults are:
|
||||||
|
|
||||||
- `CUDA_VERSION` set to `11.7.1`
|
- `CUDA_VERSION` set to `12.6.0`
|
||||||
- `CUDA_DOCKER_ARCH` set to `all`
|
- `CUDA_DOCKER_ARCH` set to the cmake build default, which includes all the supported architectures
|
||||||
|
|
||||||
The resulting images are essentially the same as the non-CUDA images:
|
The resulting images are essentially the same as the non-CUDA images:
|
||||||
|
|
||||||
|
@ -18,7 +18,7 @@ constexpr float rms_norm_eps = 5e-6f;
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||||
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
|
||||||
|
|
||||||
if (plan.work_size > 0) {
|
if (plan.work_size > 0) {
|
||||||
buf.resize(plan.work_size);
|
buf.resize(plan.work_size);
|
||||||
|
@ -49,3 +49,12 @@ There are 2 modes of operation:
|
|||||||
| 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 |
|
| 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 |
|
||||||
| 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 |
|
| 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 |
|
||||||
| 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 |
|
| 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 |
|
||||||
|
|
||||||
|
### JSONL output
|
||||||
|
|
||||||
|
Pass `--output-format jsonl` to output JSONL instead of Markdown, à la
|
||||||
|
|
||||||
|
```json lines
|
||||||
|
{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 1, "n_kv": 256, "t_pp": 0.233810, "speed_pp": 547.453064, "t_tg": 3.503684, "speed_tg": 36.532974, "t": 3.737494, "speed": 68.495094}
|
||||||
|
{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 2, "n_kv": 512, "t_pp": 0.422602, "speed_pp": 605.770935, "t_tg": 11.106112, "speed_tg": 23.050371, "t": 11.528713, "speed": 44.410854}
|
||||||
|
```
|
||||||
|
@ -28,9 +28,7 @@ static std::vector<int> parse_list(char * p) {
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
static void print_usage(int, char ** argv) {
|
||||||
gpt_params_print_usage(argc, argv, params);
|
|
||||||
|
|
||||||
LOG_TEE("\nexample usage:\n");
|
LOG_TEE("\nexample usage:\n");
|
||||||
LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
|
LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
@ -39,8 +37,8 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
|||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_BENCH, print_usage);
|
||||||
print_usage(argc, argv, params);
|
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -122,12 +120,13 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("\n");
|
if (!params.batched_bench_output_jsonl) {
|
||||||
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
|
LOG_TEE("\n");
|
||||||
LOG_TEE("\n");
|
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
|
||||||
|
LOG_TEE("\n");
|
||||||
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
||||||
LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
|
LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
|
||||||
|
}
|
||||||
|
|
||||||
for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
|
for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
|
||||||
for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
|
for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
|
||||||
@ -195,12 +194,22 @@ int main(int argc, char ** argv) {
|
|||||||
const float speed_tg = pl*tg / t_tg;
|
const float speed_tg = pl*tg / t_tg;
|
||||||
const float speed = n_kv / t;
|
const float speed = n_kv / t;
|
||||||
|
|
||||||
LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
|
if(params.batched_bench_output_jsonl) {
|
||||||
|
LOG_TEE(
|
||||||
|
"{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
|
||||||
|
"\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
|
||||||
|
n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
|
||||||
|
pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_print_timings(ctx);
|
LOG_TEE("\n");
|
||||||
|
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||||
|
|
||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
|
|
||||||
|
@ -27,7 +27,6 @@ guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), mo
|
|||||||
print("Failed to load model")
|
print("Failed to load model")
|
||||||
exit(1)
|
exit(1)
|
||||||
}
|
}
|
||||||
|
|
||||||
defer {
|
defer {
|
||||||
llama_free_model(model)
|
llama_free_model(model)
|
||||||
}
|
}
|
||||||
@ -37,7 +36,6 @@ var tokens = tokenize(text: prompt, add_bos: true)
|
|||||||
let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
|
let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
|
||||||
|
|
||||||
var context_params = llama_context_default_params()
|
var context_params = llama_context_default_params()
|
||||||
context_params.seed = 1234
|
|
||||||
context_params.n_ctx = n_kv_req
|
context_params.n_ctx = n_kv_req
|
||||||
context_params.n_batch = UInt32(max(n_len, n_parallel))
|
context_params.n_batch = UInt32(max(n_len, n_parallel))
|
||||||
context_params.n_threads = 8
|
context_params.n_threads = 8
|
||||||
@ -48,11 +46,26 @@ guard context != nil else {
|
|||||||
print("Failed to initialize context")
|
print("Failed to initialize context")
|
||||||
exit(1)
|
exit(1)
|
||||||
}
|
}
|
||||||
|
|
||||||
defer {
|
defer {
|
||||||
llama_free(context)
|
llama_free(context)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var sparams = llama_sampler_chain_default_params()
|
||||||
|
|
||||||
|
let smpl = llama_sampler_chain_init(sparams)
|
||||||
|
guard smpl != nil else {
|
||||||
|
print("Failed to initialize sampling")
|
||||||
|
exit(1)
|
||||||
|
}
|
||||||
|
defer {
|
||||||
|
llama_sampler_free(smpl)
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_sampler_chain_add(smpl, llama_sampler_init_top_k(40));
|
||||||
|
llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1));
|
||||||
|
llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.4));
|
||||||
|
llama_sampler_chain_add(smpl, llama_sampler_init_dist (1234));
|
||||||
|
|
||||||
let n_ctx = llama_n_ctx(context)
|
let n_ctx = llama_n_ctx(context)
|
||||||
|
|
||||||
print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
|
print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
|
||||||
@ -125,32 +138,9 @@ while n_cur <= n_len {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
var n_vocab = llama_n_vocab(model)
|
let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])
|
||||||
var logits = llama_get_logits_ith(context, i_batch[i])
|
|
||||||
|
|
||||||
var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab))
|
llama_sampler_accept(smpl, new_token_id)
|
||||||
|
|
||||||
for token_id in 0 ..< n_vocab {
|
|
||||||
candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
|
|
||||||
}
|
|
||||||
|
|
||||||
var candidates_p: llama_token_data_array = .init(
|
|
||||||
data: &candidates,
|
|
||||||
size: candidates.count,
|
|
||||||
sorted: false
|
|
||||||
)
|
|
||||||
|
|
||||||
let top_k: Int32 = 40
|
|
||||||
let top_p: Float = 0.9
|
|
||||||
let temp: Float = 0.4
|
|
||||||
|
|
||||||
llama_sample_top_k(context, &candidates_p, top_k, 1)
|
|
||||||
llama_sample_top_p(context, &candidates_p, top_p, 1)
|
|
||||||
llama_sample_temp(context, &candidates_p, temp)
|
|
||||||
|
|
||||||
let new_token_id = llama_sample_token(context, &candidates_p)
|
|
||||||
|
|
||||||
// const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
|
||||||
|
|
||||||
// is it an end of stream? -> mark the stream as finished
|
// is it an end of stream? -> mark the stream as finished
|
||||||
if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
|
if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
|
||||||
@ -210,9 +200,10 @@ if n_parallel > 1 {
|
|||||||
|
|
||||||
let t_main_end = ggml_time_us()
|
let t_main_end = ggml_time_us()
|
||||||
|
|
||||||
print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n")
|
print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")
|
||||||
|
|
||||||
llama_print_timings(context)
|
llama_perf_print(UnsafeRawPointer(context), LLAMA_PERF_TYPE_CONTEXT)
|
||||||
|
llama_perf_print(UnsafeRawPointer(smpl), LLAMA_PERF_TYPE_SAMPLER_CHAIN)
|
||||||
|
|
||||||
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
|
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
|
||||||
let utf8Count = text.utf8.count
|
let utf8Count = text.utf8.count
|
||||||
|
@ -2,14 +2,11 @@
|
|||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cmath>
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
static void print_usage(int, char ** argv) {
|
||||||
gpt_params_print_usage(argc, argv, params);
|
|
||||||
|
|
||||||
LOG_TEE("\nexample usage:\n");
|
LOG_TEE("\nexample usage:\n");
|
||||||
LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
|
LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
@ -21,8 +18,8 @@ int main(int argc, char ** argv) {
|
|||||||
params.prompt = "Hello my name is";
|
params.prompt = "Hello my name is";
|
||||||
params.n_predict = 32;
|
params.n_predict = 32;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
|
||||||
print_usage(argc, argv, params);
|
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -65,6 +62,15 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||||
|
|
||||||
|
auto sparams = llama_sampler_chain_default_params();
|
||||||
|
|
||||||
|
llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
||||||
|
|
||||||
|
llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k));
|
||||||
|
llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep));
|
||||||
|
llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp));
|
||||||
|
llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
|
||||||
|
|
||||||
if (ctx == NULL) {
|
if (ctx == NULL) {
|
||||||
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
|
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
|
||||||
return 1;
|
return 1;
|
||||||
@ -164,29 +170,9 @@ int main(int argc, char ** argv) {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto n_vocab = llama_n_vocab(model);
|
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
|
||||||
auto * logits = llama_get_logits_ith(ctx, i_batch[i]);
|
|
||||||
|
|
||||||
std::vector<llama_token_data> candidates;
|
llama_sampler_accept(smpl, new_token_id);
|
||||||
candidates.reserve(n_vocab);
|
|
||||||
|
|
||||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
|
||||||
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
|
||||||
|
|
||||||
const int top_k = 40;
|
|
||||||
const float top_p = 0.9f;
|
|
||||||
const float temp = 0.4f;
|
|
||||||
|
|
||||||
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
|
|
||||||
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
|
|
||||||
llama_sample_temp (ctx, &candidates_p, temp);
|
|
||||||
|
|
||||||
const llama_token new_token_id = llama_sample_token(ctx, &candidates_p);
|
|
||||||
|
|
||||||
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
|
||||||
|
|
||||||
// is it an end of generation? -> mark the stream as finished
|
// is it an end of generation? -> mark the stream as finished
|
||||||
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
|
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
|
||||||
@ -244,12 +230,15 @@ int main(int argc, char ** argv) {
|
|||||||
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
|
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
|
||||||
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
||||||
|
|
||||||
llama_print_timings(ctx);
|
LOG_TEE("\n");
|
||||||
|
llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
|
||||||
|
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||||
|
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
|
|
||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
|
|
||||||
|
llama_sampler_free(smpl);
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
|
||||||
|
@ -21,7 +21,7 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||||
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
|
||||||
|
|
||||||
if (plan.work_size > 0) {
|
if (plan.work_size > 0) {
|
||||||
buf.resize(plan.work_size);
|
buf.resize(plan.work_size);
|
||||||
@ -54,7 +54,7 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) {
|
|||||||
#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
|
#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
|
||||||
|
|
||||||
struct benchmark_params_struct {
|
struct benchmark_params_struct {
|
||||||
int32_t n_threads = 1;
|
int n_threads = 1;
|
||||||
int32_t n_iterations = 10;
|
int32_t n_iterations = 10;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -35,9 +35,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
static void print_usage(int, char ** argv) {
|
||||||
gpt_params_print_usage(argc, argv, params);
|
|
||||||
|
|
||||||
printf("\nexample usage:\n");
|
printf("\nexample usage:\n");
|
||||||
printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
|
printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
|
||||||
printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
|
printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
|
||||||
@ -390,8 +388,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
|
|||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage);
|
||||||
print_usage(argc, argv, params);
|
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -486,8 +484,8 @@ int main(int argc, char ** argv) {
|
|||||||
if (use_pca) {
|
if (use_pca) {
|
||||||
// run PCA
|
// run PCA
|
||||||
PCA::pca_params pca_params;
|
PCA::pca_params pca_params;
|
||||||
pca_params.n_threads = params.n_threads;
|
pca_params.n_threads = params.cpuparams.n_threads;
|
||||||
pca_params.n_batch = params.n_pca_batch;
|
pca_params.n_batch = params.n_pca_batch;
|
||||||
pca_params.n_iterations = params.n_pca_iterations;
|
pca_params.n_iterations = params.n_pca_iterations;
|
||||||
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
|
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
|
||||||
} else {
|
} else {
|
||||||
|
@ -79,8 +79,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
|||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EMBEDDING);
|
||||||
gpt_params_print_usage(argc, argv, params);
|
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -90,13 +90,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
print_build_info();
|
print_build_info();
|
||||||
|
|
||||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
|
||||||
params.seed = time(NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
|
|
||||||
|
|
||||||
std::mt19937 rng(params.seed);
|
|
||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
@ -313,8 +307,10 @@ int main(int argc, char ** argv) {
|
|||||||
if (notArray) fprintf(stdout, "\n}\n");
|
if (notArray) fprintf(stdout, "\n}\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
LOG_TEE("\n");
|
||||||
|
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||||
|
|
||||||
// clean up
|
// clean up
|
||||||
llama_print_timings(ctx);
|
|
||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
@ -144,15 +144,13 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||||
gpt_params_print_usage(argc, argv, params);
|
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
print_build_info();
|
print_build_info();
|
||||||
|
|
||||||
std::mt19937 rng(params.seed);
|
|
||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
|
|
||||||
@ -183,7 +181,8 @@ int main(int argc, char ** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_print_timings(ctx);
|
LOG_TEE("\n");
|
||||||
|
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||||
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
@ -391,9 +391,7 @@ struct lora_merge_ctx {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
static void print_usage(int, char ** argv) {
|
||||||
gpt_params_print_usage(argc, argv, params);
|
|
||||||
|
|
||||||
printf("\nexample usage:\n");
|
printf("\nexample usage:\n");
|
||||||
printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
|
printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
|
||||||
printf("\nNOTE: output model is F16\n");
|
printf("\nNOTE: output model is F16\n");
|
||||||
@ -403,14 +401,14 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
|||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage);
|
||||||
print_usage(argc, argv, params);
|
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
g_verbose = (params.verbosity == 1);
|
g_verbose = (params.verbosity == 1);
|
||||||
try {
|
try {
|
||||||
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads);
|
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
|
||||||
ctx.run_merge();
|
ctx.run_merge();
|
||||||
} catch (const std::exception & err) {
|
} catch (const std::exception & err) {
|
||||||
fprintf(stderr, "%s\n", err.what());
|
fprintf(stderr, "%s\n", err.what());
|
||||||
|
@ -1,9 +1,5 @@
|
|||||||
#define LLAMA_API_INTERNAL
|
|
||||||
|
|
||||||
#include "grammar-parser.h"
|
|
||||||
#include "ggml.h"
|
|
||||||
#include "llama.h"
|
|
||||||
#include "unicode.h"
|
#include "unicode.h"
|
||||||
|
#include "llama-grammar.h"
|
||||||
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
@ -12,29 +8,28 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
static bool llama_sample_grammar_string(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
|
static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
|
||||||
auto decoded = decode_utf8(input_str, {});
|
const auto cpts = unicode_cpts_from_utf8(input_str);
|
||||||
const auto & code_points = decoded.first;
|
|
||||||
|
|
||||||
const llama_grammar_rules & rules = llama_grammar_get_rules (grammar);
|
const llama_grammar_rules & rules = llama_grammar_get_rules (grammar);
|
||||||
llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
|
llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);
|
||||||
|
|
||||||
size_t pos = 0;
|
size_t pos = 0;
|
||||||
for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
|
for (const auto & cpt : cpts) {
|
||||||
const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy
|
const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy
|
||||||
|
|
||||||
llama_grammar_accept(rules, prev_stacks, *it, cur_stacks);
|
llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur);
|
||||||
|
|
||||||
if (cur_stacks.empty()) {
|
if (stacks_cur.empty()) {
|
||||||
error_pos = pos;
|
error_pos = pos;
|
||||||
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
|
error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'";
|
||||||
cur_stacks = prev_stacks;
|
stacks_cur = stacks_prev;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
++pos;
|
++pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (const auto & stack : cur_stacks) {
|
for (const auto & stack : stacks_cur) {
|
||||||
if (stack.empty()) {
|
if (stack.empty()) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -85,27 +80,7 @@ int main(int argc, char** argv) {
|
|||||||
grammar_str = buffer.str();
|
grammar_str = buffer.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parse the GBNF grammar
|
llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root");
|
||||||
auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());
|
|
||||||
|
|
||||||
// will be empty (default) if there are parse errors
|
|
||||||
if (parsed_grammar.rules.empty()) {
|
|
||||||
fprintf(stdout, "%s: failed to parse grammar\n", __func__);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Ensure that there is a "root" node.
|
|
||||||
if (parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()) {
|
|
||||||
fprintf(stdout, "%s: grammar does not contain a 'root' symbol\n", __func__);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
|
|
||||||
|
|
||||||
// Create the LLAMA grammar
|
|
||||||
auto grammar = llama_grammar_init(
|
|
||||||
grammar_rules.data(),
|
|
||||||
grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
|
||||||
if (grammar == nullptr) {
|
if (grammar == nullptr) {
|
||||||
throw std::runtime_error("Failed to initialize llama_grammar");
|
throw std::runtime_error("Failed to initialize llama_grammar");
|
||||||
}
|
}
|
||||||
@ -122,7 +97,7 @@ int main(int argc, char** argv) {
|
|||||||
// Validate the input string against the grammar
|
// Validate the input string against the grammar
|
||||||
size_t error_pos;
|
size_t error_pos;
|
||||||
std::string error_msg;
|
std::string error_msg;
|
||||||
bool is_valid = llama_sample_grammar_string(grammar, input_str, error_pos, error_msg);
|
bool is_valid = llama_grammar_validate(grammar, input_str, error_pos, error_msg);
|
||||||
|
|
||||||
if (is_valid) {
|
if (is_valid) {
|
||||||
fprintf(stdout, "Input string is valid according to the grammar.\n");
|
fprintf(stdout, "Input string is valid according to the grammar.\n");
|
||||||
@ -131,7 +106,7 @@ int main(int argc, char** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Clean up
|
// Clean up
|
||||||
llama_grammar_free(grammar);
|
llama_grammar_free_impl(grammar);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
5
examples/gen-docs/CMakeLists.txt
Normal file
5
examples/gen-docs/CMakeLists.txt
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
# Build script for the gen-docs example: a single executable compiled from gen-docs.cpp.
set(TARGET llama-gen-docs)
|
||||||
|
add_executable(${TARGET} gen-docs.cpp)
|
||||||
|
# Install the resulting binary as a runtime artifact.
install(TARGETS ${TARGET} RUNTIME)
|
||||||
|
# Link against the project's `common` and `llama` targets plus the thread library.
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
# Require at least C++11 when compiling this target.
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
51
examples/gen-docs/gen-docs.cpp
Normal file
51
examples/gen-docs/gen-docs.cpp
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
#include "common.h"

#include <cstdio>
#include <fstream>
#include <string>
|
||||||
|
|
||||||
|
// Export usage message (-h) to markdown format
|
||||||
|
|
||||||
|
static void export_md(std::string fname, llama_example ex) {
|
||||||
|
std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
|
||||||
|
|
||||||
|
gpt_params params;
|
||||||
|
auto options = gpt_params_parser_init(params, ex);
|
||||||
|
|
||||||
|
file << "| Argument | Explanation |\n";
|
||||||
|
file << "| -------- | ----------- |\n";
|
||||||
|
for (auto & opt : options) {
|
||||||
|
file << "| `";
|
||||||
|
// args
|
||||||
|
for (const auto & arg : opt.args) {
|
||||||
|
if (arg == opt.args.front()) {
|
||||||
|
file << arg;
|
||||||
|
if (opt.args.size() > 1) file << ", ";
|
||||||
|
} else {
|
||||||
|
file << arg << (arg != opt.args.back() ? ", " : "");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// value hint
|
||||||
|
if (opt.value_hint) {
|
||||||
|
std::string md_value_hint(opt.value_hint);
|
||||||
|
string_replace_all(md_value_hint, "|", "\\|");
|
||||||
|
file << " " << md_value_hint;
|
||||||
|
}
|
||||||
|
if (opt.value_hint_2) {
|
||||||
|
std::string md_value_hint_2(opt.value_hint_2);
|
||||||
|
string_replace_all(md_value_hint_2, "|", "\\|");
|
||||||
|
file << " " << md_value_hint_2;
|
||||||
|
}
|
||||||
|
// help text
|
||||||
|
std::string md_help(opt.help);
|
||||||
|
string_replace_all(md_help, "\n", "<br/>");
|
||||||
|
string_replace_all(md_help, "|", "\\|");
|
||||||
|
file << "` | " << md_help << " |\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int, char **) {
|
||||||
|
export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN);
|
||||||
|
export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
@ -9,7 +9,7 @@
|
|||||||
static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
|
static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
|
||||||
std::vector<std::vector<float>> result;
|
std::vector<std::vector<float>> result;
|
||||||
|
|
||||||
const llama_model * mdl = llama_get_model(ctx);
|
const llama_model * model = llama_get_model(ctx);
|
||||||
|
|
||||||
llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
|
llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
|
||||||
|
|
||||||
@ -18,16 +18,16 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
|
|||||||
|
|
||||||
const std::string input_string = instruction + sentences[i];
|
const std::string input_string = instruction + sentences[i];
|
||||||
|
|
||||||
std::vector<llama_token> inputs = llama_tokenize(mdl, input_string, true, false);
|
std::vector<llama_token> inputs = llama_tokenize(model, input_string, true, false);
|
||||||
|
|
||||||
const int32_t n_toks = inputs.size();
|
const int32_t n_toks = inputs.size();
|
||||||
|
|
||||||
// GritLM seems to have EOS = ""
|
// GritLM seems to have EOS = ""
|
||||||
// https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
|
// https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
|
||||||
// inputs.push_back(llama_token_eos(mdl));
|
// inputs.push_back(llama_token_eos(model));
|
||||||
|
|
||||||
// we want to ignore instruction tokens for mean pooling
|
// we want to ignore instruction tokens for mean pooling
|
||||||
const int32_t n_inst = llama_tokenize(mdl, instruction, true, false).size();
|
const int32_t n_inst = llama_tokenize(model, instruction, true, false).size();
|
||||||
|
|
||||||
#ifdef GRIT_DEBUG
|
#ifdef GRIT_DEBUG
|
||||||
// debug tokens - should be matching as referenced in the GritLM sample
|
// debug tokens - should be matching as referenced in the GritLM sample
|
||||||
@ -51,7 +51,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
|
|||||||
llama_decode(ctx, batch);
|
llama_decode(ctx, batch);
|
||||||
|
|
||||||
// get embedding dimensions
|
// get embedding dimensions
|
||||||
uint64_t n_embd = llama_n_embd(mdl);
|
uint64_t n_embd = llama_n_embd(model);
|
||||||
|
|
||||||
// allocate embedding output
|
// allocate embedding output
|
||||||
std::vector<float> emb_unorm(n_embd, 0.0f);
|
std::vector<float> emb_unorm(n_embd, 0.0f);
|
||||||
@ -92,11 +92,11 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::string generate(llama_context * ctx, const std::string & prompt, bool stream) {
|
static std::string generate(llama_context * ctx, llama_sampler * smpl, const std::string & prompt, bool stream) {
|
||||||
std::string result;
|
std::string result;
|
||||||
|
|
||||||
const llama_model * mdl = llama_get_model(ctx);
|
const llama_model * model = llama_get_model(ctx);
|
||||||
llama_token eos_token = llama_token_eos(mdl);
|
llama_token eos_token = llama_token_eos(model);
|
||||||
|
|
||||||
llama_kv_cache_clear(ctx);
|
llama_kv_cache_clear(ctx);
|
||||||
llama_set_embeddings(ctx, false);
|
llama_set_embeddings(ctx, false);
|
||||||
@ -104,28 +104,25 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
|
|||||||
|
|
||||||
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
|
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
|
||||||
|
|
||||||
std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
|
std::vector<llama_token> inputs = llama_tokenize(model, prompt, false, true);
|
||||||
int32_t i_current_token = 0;
|
int32_t i_current_token = 0;
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
llama_batch_clear(bat);
|
llama_batch_clear(bat);
|
||||||
auto n_inputs = (int32_t)inputs.size();
|
{
|
||||||
for (int32_t i = 0; i < n_inputs; i++) {
|
const int32_t n_inputs = inputs.size();
|
||||||
llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
|
|
||||||
|
for (int32_t i = 0; i < n_inputs; i++) {
|
||||||
|
llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
inputs.clear();
|
inputs.clear();
|
||||||
|
|
||||||
llama_decode(ctx, bat);
|
llama_decode(ctx, bat);
|
||||||
auto logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);
|
|
||||||
|
|
||||||
auto candidates = std::vector<llama_token_data>(llama_n_vocab(mdl));
|
llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1);
|
||||||
auto n_candidates = (int32_t)candidates.size();
|
llama_sampler_accept(smpl, token);
|
||||||
for (int32_t token = 0; token < n_candidates; token++) {
|
|
||||||
candidates[token] = llama_token_data{ token, logits[token], 0.0f };
|
|
||||||
}
|
|
||||||
auto candidates_p = llama_token_data_array{ candidates.data(), candidates.size(), false };
|
|
||||||
|
|
||||||
llama_token token = llama_sample_token_greedy(ctx, &candidates_p);
|
|
||||||
if (token == eos_token) {
|
if (token == eos_token) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -157,8 +154,8 @@ static std::string gritlm_instruction(const std::string & instruction) {
|
|||||||
int main(int argc, char * argv[]) {
|
int main(int argc, char * argv[]) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||||
gpt_params_print_usage(argc, argv, params);
|
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -167,10 +164,18 @@ int main(int argc, char * argv[]) {
|
|||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
|
|
||||||
llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);
|
llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
|
||||||
|
|
||||||
// create generation context
|
// create generation context
|
||||||
llama_context * ctx = llama_new_context_with_model(mdl, cparams);
|
llama_context * ctx = llama_new_context_with_model(model, cparams);
|
||||||
|
|
||||||
|
auto sparams = llama_sampler_chain_default_params();
|
||||||
|
|
||||||
|
sparams.no_perf = false;
|
||||||
|
|
||||||
|
llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
||||||
|
|
||||||
|
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
|
||||||
|
|
||||||
// ### Embedding/Representation ###
|
// ### Embedding/Representation ###
|
||||||
// samples taken from: https://github.com/ContextualAI/gritlm#basic
|
// samples taken from: https://github.com/ContextualAI/gritlm#basic
|
||||||
@ -191,7 +196,7 @@ int main(int argc, char * argv[]) {
|
|||||||
const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
|
const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
|
||||||
const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));
|
const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));
|
||||||
|
|
||||||
const int n_embd = llama_n_embd(mdl);
|
const int n_embd = llama_n_embd(model);
|
||||||
|
|
||||||
const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
|
const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
|
||||||
const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
|
const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
|
||||||
@ -208,11 +213,12 @@ int main(int argc, char * argv[]) {
|
|||||||
// GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
|
// GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
|
||||||
{
|
{
|
||||||
const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n";
|
const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n";
|
||||||
std::string response = generate(ctx, prompt, true);
|
std::string response = generate(ctx, smpl, prompt, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
llama_sampler_free(smpl);
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(mdl);
|
llama_free_model(model);
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -15,9 +15,7 @@
|
|||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
static void print_usage(int, char ** argv) {
|
||||||
gpt_params_print_usage(argc, argv, params);
|
|
||||||
|
|
||||||
LOG_TEE("\nexample usage:\n");
|
LOG_TEE("\nexample usage:\n");
|
||||||
LOG_TEE("\n %s \\\n"
|
LOG_TEE("\n %s \\\n"
|
||||||
" -m model.gguf -f some-text.txt [-o imatrix.gguf] [--process-output] [--verbosity 1] \\\n"
|
" -m model.gguf -f some-text.txt [-o imatrix.gguf] [--process-output] [--verbosity 1] \\\n"
|
||||||
@ -657,8 +655,8 @@ int main(int argc, char ** argv) {
|
|||||||
params.logits_all = true;
|
params.logits_all = true;
|
||||||
params.verbosity = 1;
|
params.verbosity = 1;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_IMATRIX, print_usage);
|
||||||
print_usage(argc, argv, params);
|
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -731,7 +729,8 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
g_collector.save_imatrix();
|
g_collector.save_imatrix();
|
||||||
|
|
||||||
llama_print_timings(ctx);
|
LOG_TEE("\n");
|
||||||
|
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||||
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
@ -2,7 +2,6 @@
|
|||||||
|
|
||||||
#include "console.h"
|
#include "console.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "grammar-parser.h"
|
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <cinttypes>
|
#include <cinttypes>
|
||||||
@ -34,6 +33,7 @@
|
|||||||
|
|
||||||
static llama_context ** g_ctx;
|
static llama_context ** g_ctx;
|
||||||
static llama_model ** g_model;
|
static llama_model ** g_model;
|
||||||
|
static gpt_sampler ** g_smpl;
|
||||||
static gpt_params * g_params;
|
static gpt_params * g_params;
|
||||||
static std::vector<llama_token> * g_input_tokens;
|
static std::vector<llama_token> * g_input_tokens;
|
||||||
static std::ostringstream * g_output_ss;
|
static std::ostringstream * g_output_ss;
|
||||||
@ -81,7 +81,7 @@ static void write_logfile(
|
|||||||
yaml_dump_string_multiline(logfile, "output", output.c_str());
|
yaml_dump_string_multiline(logfile, "output", output.c_str());
|
||||||
yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
|
yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
|
||||||
|
|
||||||
llama_dump_timing_info_yaml(logfile, ctx);
|
llama_perf_dump_yaml(logfile, ctx);
|
||||||
fclose(logfile);
|
fclose(logfile);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -93,7 +93,7 @@ static void sigint_handler(int signo) {
|
|||||||
} else {
|
} else {
|
||||||
console::cleanup();
|
console::cleanup();
|
||||||
printf("\n");
|
printf("\n");
|
||||||
llama_print_timings(*g_ctx);
|
gpt_perf_print(*g_ctx, *g_smpl);
|
||||||
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
|
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
|
||||||
_exit(130);
|
_exit(130);
|
||||||
}
|
}
|
||||||
@ -103,14 +103,15 @@ static void sigint_handler(int signo) {
|
|||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
llama_sampling_params & sparams = params.sparams;
|
|
||||||
g_params = ¶ms;
|
g_params = ¶ms;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_INFILL);
|
||||||
gpt_params_print_usage(argc, argv, params);
|
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
auto & sparams = params.sparams;
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
#ifndef LOG_DISABLE_LOGS
|
||||||
log_set_target(log_filename_generator("infill", "log"));
|
log_set_target(log_filename_generator("infill", "log"));
|
||||||
LOG_TEE("Log start\n");
|
LOG_TEE("Log start\n");
|
||||||
@ -156,26 +157,21 @@ int main(int argc, char ** argv) {
|
|||||||
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
|
print_build_info();
|
||||||
LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
|
||||||
|
|
||||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
|
||||||
params.seed = time(NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG_TEE("%s: seed = %u\n", __func__, params.seed);
|
|
||||||
|
|
||||||
std::mt19937 rng(params.seed);
|
|
||||||
|
|
||||||
LOG("%s: llama backend init\n", __func__);
|
LOG("%s: llama backend init\n", __func__);
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
|
|
||||||
llama_model * model;
|
llama_model * model = nullptr;
|
||||||
llama_context * ctx;
|
llama_context * ctx = nullptr;
|
||||||
|
gpt_sampler * smpl = nullptr;
|
||||||
|
|
||||||
g_model = &model;
|
g_model = &model;
|
||||||
g_ctx = &ctx;
|
g_ctx = &ctx;
|
||||||
|
g_smpl = &smpl;
|
||||||
|
|
||||||
// load the model and apply lora adapter, if any
|
// load the model and apply lora adapter, if any
|
||||||
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||||
@ -305,7 +301,7 @@ int main(int argc, char ** argv) {
|
|||||||
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
|
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
|
LOG_TEE("sampling: \n%s\n", sparams.print().c_str());
|
||||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||||
LOG_TEE("\n\n");
|
LOG_TEE("\n\n");
|
||||||
|
|
||||||
@ -349,7 +345,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
std::vector<llama_token> embd;
|
std::vector<llama_token> embd;
|
||||||
|
|
||||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
|
smpl = gpt_sampler_init(model, sparams);
|
||||||
|
|
||||||
while (n_remain != 0 || params.interactive) {
|
while (n_remain != 0 || params.interactive) {
|
||||||
// predict
|
// predict
|
||||||
@ -421,11 +417,11 @@ int main(int argc, char ** argv) {
|
|||||||
embd.clear();
|
embd.clear();
|
||||||
|
|
||||||
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
||||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);
|
const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
|
||||||
|
|
||||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
gpt_sampler_accept(smpl, id, true);
|
||||||
|
|
||||||
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
|
// LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str());
|
||||||
|
|
||||||
embd.push_back(id);
|
embd.push_back(id);
|
||||||
|
|
||||||
@ -444,7 +440,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// push the prompt in the sampling context in order to apply repetition penalties later
|
// push the prompt in the sampling context in order to apply repetition penalties later
|
||||||
// for the prompt, we don't apply grammar rules
|
// for the prompt, we don't apply grammar rules
|
||||||
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
|
gpt_sampler_accept(smpl, embd_inp[n_consumed], false);
|
||||||
|
|
||||||
++n_consumed;
|
++n_consumed;
|
||||||
if ((int) embd.size() >= params.n_batch) {
|
if ((int) embd.size() >= params.n_batch) {
|
||||||
@ -476,7 +472,7 @@ int main(int argc, char ** argv) {
|
|||||||
// if not currently processing queued inputs;
|
// if not currently processing queued inputs;
|
||||||
if ((int) embd_inp.size() <= n_consumed) {
|
if ((int) embd_inp.size() <= n_consumed) {
|
||||||
// deal with eot token in infill mode
|
// deal with eot token in infill mode
|
||||||
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
|
if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
|
||||||
if (is_interacting && !params.interactive_first) {
|
if (is_interacting && !params.interactive_first) {
|
||||||
// print an eot token
|
// print an eot token
|
||||||
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
||||||
@ -542,7 +538,7 @@ int main(int argc, char ** argv) {
|
|||||||
is_interacting = false;
|
is_interacting = false;
|
||||||
}
|
}
|
||||||
// deal with end of generation tokens in interactive mode
|
// deal with end of generation tokens in interactive mode
|
||||||
else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
|
else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
|
||||||
LOG("found EOS token\n");
|
LOG("found EOS token\n");
|
||||||
|
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
@ -615,7 +611,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
if (n_past > 0) {
|
if (n_past > 0) {
|
||||||
if (is_interacting) {
|
if (is_interacting) {
|
||||||
llama_sampling_reset(ctx_sampling);
|
gpt_sampler_reset(smpl);
|
||||||
}
|
}
|
||||||
is_interacting = false;
|
is_interacting = false;
|
||||||
}
|
}
|
||||||
@ -638,13 +634,14 @@ int main(int argc, char ** argv) {
|
|||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_print_timings(ctx);
|
LOG_TEE("\n");
|
||||||
|
gpt_perf_print(ctx, smpl);
|
||||||
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
|
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
|
||||||
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
|
||||||
llama_sampling_free(ctx_sampling);
|
gpt_sampler_free(smpl);
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
#ifndef LOG_DISABLE_LOGS
|
||||||
|
@ -14,7 +14,8 @@ Performance testing tool for llama.cpp.
|
|||||||
1. [Markdown](#markdown)
|
1. [Markdown](#markdown)
|
||||||
2. [CSV](#csv)
|
2. [CSV](#csv)
|
||||||
3. [JSON](#json)
|
3. [JSON](#json)
|
||||||
4. [SQL](#sql)
|
4. [JSONL](#jsonl)
|
||||||
|
5. [SQL](#sql)
|
||||||
|
|
||||||
## Syntax
|
## Syntax
|
||||||
|
|
||||||
@ -23,27 +24,34 @@ usage: ./llama-bench [options]
|
|||||||
|
|
||||||
options:
|
options:
|
||||||
-h, --help
|
-h, --help
|
||||||
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
|
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
|
||||||
-p, --n-prompt <n> (default: 512)
|
-p, --n-prompt <n> (default: 512)
|
||||||
-n, --n-gen <n> (default: 128)
|
-n, --n-gen <n> (default: 128)
|
||||||
-pg <pp,tg> (default: 512,128)
|
-pg <pp,tg> (default: )
|
||||||
-b, --batch-size <n> (default: 2048)
|
-b, --batch-size <n> (default: 2048)
|
||||||
-ub, --ubatch-size <n> (default: 512)
|
-ub, --ubatch-size <n> (default: 512)
|
||||||
-ctk, --cache-type-k <t> (default: f16)
|
-ctk, --cache-type-k <t> (default: f16)
|
||||||
-ctv, --cache-type-v <t> (default: f16)
|
-ctv, --cache-type-v <t> (default: f16)
|
||||||
-t, --threads <n> (default: 16)
|
-t, --threads <n> (default: 8)
|
||||||
-ngl, --n-gpu-layers <n> (default: 99)
|
-C, --cpu-mask <hex,hex> (default: 0x0)
|
||||||
-sm, --split-mode <none|layer|row> (default: layer)
|
--cpu-strict <0|1> (default: 0)
|
||||||
-mg, --main-gpu <i> (default: 0)
|
--poll <0...100> (default: 50)
|
||||||
-nkvo, --no-kv-offload <0|1> (default: 0)
|
-ngl, --n-gpu-layers <n> (default: 99)
|
||||||
-fa, --flash-attn <0|1> (default: 0)
|
-rpc, --rpc <rpc_servers> (default: )
|
||||||
-mmp, --mmap <0|1> (default: 1)
|
-sm, --split-mode <none|layer|row> (default: layer)
|
||||||
--numa <distribute|isolate|numactl> (default: disabled)
|
-mg, --main-gpu <i> (default: 0)
|
||||||
-embd, --embeddings <0|1> (default: 0)
|
-nkvo, --no-kv-offload <0|1> (default: 0)
|
||||||
-ts, --tensor-split <ts0/ts1/..> (default: 0)
|
-fa, --flash-attn <0|1> (default: 0)
|
||||||
-r, --repetitions <n> (default: 5)
|
-mmp, --mmap <0|1> (default: 1)
|
||||||
-o, --output <csv|json|md|sql> (default: md)
|
--numa <distribute|isolate|numactl> (default: disabled)
|
||||||
-v, --verbose (default: 0)
|
-embd, --embeddings <0|1> (default: 0)
|
||||||
|
-ts, --tensor-split <ts0/ts1/..> (default: 0)
|
||||||
|
-r, --repetitions <n> (default: 5)
|
||||||
|
--prio <0|1|2|3> (default: 0)
|
||||||
|
--delay <0...N> (seconds) (default: 0)
|
||||||
|
-o, --output <csv|json|jsonl|md|sql> (default: md)
|
||||||
|
-oe, --output-err <csv|json|jsonl|md|sql> (default: none)
|
||||||
|
-v, --verbose (default: 0)
|
||||||
|
|
||||||
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
|
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
|
||||||
```
|
```
|
||||||
@ -238,6 +246,19 @@ $ ./llama-bench -o json
|
|||||||
]
|
]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### JSONL
|
||||||
|
|
||||||
|
```sh
|
||||||
|
$ ./llama-bench -o jsonl
|
||||||
|
```
|
||||||
|
|
||||||
|
```json lines
|
||||||
|
{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]}
|
||||||
|
{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]}
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
### SQL
|
### SQL
|
||||||
|
|
||||||
SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database.
|
SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database.
|
||||||
|
@ -16,6 +16,7 @@
|
|||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <thread>
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
@ -123,6 +124,9 @@ static std::string get_cpu_info() {
|
|||||||
(LPBYTE)cpu_brand,
|
(LPBYTE)cpu_brand,
|
||||||
&cpu_brand_size) == ERROR_SUCCESS) {
|
&cpu_brand_size) == ERROR_SUCCESS) {
|
||||||
id.assign(cpu_brand, cpu_brand_size);
|
id.assign(cpu_brand, cpu_brand_size);
|
||||||
|
if (id.find('\0') != std::string::npos) {
|
||||||
|
id.resize(id.find('\0'));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
RegCloseKey(hKey);
|
RegCloseKey(hKey);
|
||||||
#endif
|
#endif
|
||||||
@ -170,13 +174,14 @@ static std::string get_gpu_info() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// command line params
|
// command line params
|
||||||
enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};
|
enum output_formats {NONE, CSV, JSON, JSONL, MARKDOWN, SQL};
|
||||||
|
|
||||||
static const char * output_format_str(output_formats format) {
|
static const char * output_format_str(output_formats format) {
|
||||||
switch (format) {
|
switch (format) {
|
||||||
case NONE: return "none";
|
case NONE: return "none";
|
||||||
case CSV: return "csv";
|
case CSV: return "csv";
|
||||||
case JSON: return "json";
|
case JSON: return "json";
|
||||||
|
case JSONL: return "jsonl";
|
||||||
case MARKDOWN: return "md";
|
case MARKDOWN: return "md";
|
||||||
case SQL: return "sql";
|
case SQL: return "sql";
|
||||||
default: GGML_ABORT("invalid output format");
|
default: GGML_ABORT("invalid output format");
|
||||||
@ -190,6 +195,8 @@ static bool output_format_from_str(const std::string & s, output_formats & forma
|
|||||||
format = CSV;
|
format = CSV;
|
||||||
} else if (s == "json") {
|
} else if (s == "json") {
|
||||||
format = JSON;
|
format = JSON;
|
||||||
|
} else if (s == "jsonl") {
|
||||||
|
format = JSONL;
|
||||||
} else if (s == "md") {
|
} else if (s == "md") {
|
||||||
format = MARKDOWN;
|
format = MARKDOWN;
|
||||||
} else if (s == "sql") {
|
} else if (s == "sql") {
|
||||||
@ -225,6 +232,9 @@ struct cmd_params {
|
|||||||
std::vector<ggml_type> type_k;
|
std::vector<ggml_type> type_k;
|
||||||
std::vector<ggml_type> type_v;
|
std::vector<ggml_type> type_v;
|
||||||
std::vector<int> n_threads;
|
std::vector<int> n_threads;
|
||||||
|
std::vector<std::string> cpu_mask;
|
||||||
|
std::vector<bool> cpu_strict;
|
||||||
|
std::vector<int> poll;
|
||||||
std::vector<int> n_gpu_layers;
|
std::vector<int> n_gpu_layers;
|
||||||
std::vector<std::string> rpc_servers;
|
std::vector<std::string> rpc_servers;
|
||||||
std::vector<llama_split_mode> split_mode;
|
std::vector<llama_split_mode> split_mode;
|
||||||
@ -236,7 +246,10 @@ struct cmd_params {
|
|||||||
std::vector<bool> embeddings;
|
std::vector<bool> embeddings;
|
||||||
ggml_numa_strategy numa;
|
ggml_numa_strategy numa;
|
||||||
int reps;
|
int reps;
|
||||||
|
ggml_sched_priority prio;
|
||||||
|
int delay;
|
||||||
bool verbose;
|
bool verbose;
|
||||||
|
bool progress;
|
||||||
output_formats output_format;
|
output_formats output_format;
|
||||||
output_formats output_format_stderr;
|
output_formats output_format_stderr;
|
||||||
};
|
};
|
||||||
@ -251,6 +264,9 @@ static const cmd_params cmd_params_defaults = {
|
|||||||
/* type_k */ {GGML_TYPE_F16},
|
/* type_k */ {GGML_TYPE_F16},
|
||||||
/* type_v */ {GGML_TYPE_F16},
|
/* type_v */ {GGML_TYPE_F16},
|
||||||
/* n_threads */ {cpu_get_num_math()},
|
/* n_threads */ {cpu_get_num_math()},
|
||||||
|
/* cpu_mask */ {"0x0"},
|
||||||
|
/* cpu_strict */ {false},
|
||||||
|
/* poll */ {50},
|
||||||
/* n_gpu_layers */ {99},
|
/* n_gpu_layers */ {99},
|
||||||
/* rpc_servers */ {""},
|
/* rpc_servers */ {""},
|
||||||
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
||||||
@ -262,7 +278,10 @@ static const cmd_params cmd_params_defaults = {
|
|||||||
/* embeddings */ {false},
|
/* embeddings */ {false},
|
||||||
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
|
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
|
||||||
/* reps */ 5,
|
/* reps */ 5,
|
||||||
|
/* prio */ GGML_SCHED_PRIO_NORMAL,
|
||||||
|
/* delay */ 0,
|
||||||
/* verbose */ false,
|
/* verbose */ false,
|
||||||
|
/* progress */ false,
|
||||||
/* output_format */ MARKDOWN,
|
/* output_format */ MARKDOWN,
|
||||||
/* output_format_stderr */ NONE,
|
/* output_format_stderr */ NONE,
|
||||||
};
|
};
|
||||||
@ -272,29 +291,37 @@ static void print_usage(int /* argc */, char ** argv) {
|
|||||||
printf("\n");
|
printf("\n");
|
||||||
printf("options:\n");
|
printf("options:\n");
|
||||||
printf(" -h, --help\n");
|
printf(" -h, --help\n");
|
||||||
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
|
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
|
||||||
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
|
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
|
||||||
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
|
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
|
||||||
printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
|
printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
|
||||||
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
|
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
|
||||||
printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
|
printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
|
||||||
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
|
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
|
||||||
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
|
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
|
||||||
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
|
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
|
||||||
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
|
||||||
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
|
printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
|
||||||
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
|
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
|
||||||
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
||||||
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
#ifdef GGML_USE_RPC
|
||||||
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
|
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
|
||||||
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
|
#endif
|
||||||
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
|
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
|
||||||
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
|
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
||||||
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
||||||
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
|
||||||
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
|
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
|
||||||
printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
|
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
|
||||||
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
|
||||||
|
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
||||||
|
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
||||||
|
printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
|
||||||
|
printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
|
||||||
|
printf(" -o, --output <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
|
||||||
|
printf(" -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
|
||||||
|
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
||||||
|
printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
|
||||||
printf("\n");
|
printf("\n");
|
||||||
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
|
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
|
||||||
}
|
}
|
||||||
@ -338,6 +365,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|||||||
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
|
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
|
||||||
params.reps = cmd_params_defaults.reps;
|
params.reps = cmd_params_defaults.reps;
|
||||||
params.numa = cmd_params_defaults.numa;
|
params.numa = cmd_params_defaults.numa;
|
||||||
|
params.prio = cmd_params_defaults.prio;
|
||||||
|
params.delay = cmd_params_defaults.delay;
|
||||||
|
params.progress = cmd_params_defaults.progress;
|
||||||
|
|
||||||
for (int i = 1; i < argc; i++) {
|
for (int i = 1; i < argc; i++) {
|
||||||
arg = argv[i];
|
arg = argv[i];
|
||||||
@ -433,6 +463,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
auto p = string_split<int>(argv[i], split_delim);
|
auto p = string_split<int>(argv[i], split_delim);
|
||||||
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
|
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
|
||||||
|
} else if (arg == "-C" || arg == "--cpu-mask") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
auto p = string_split<std::string>(argv[i], split_delim);
|
||||||
|
params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
|
||||||
|
} else if (arg == "--cpu-strict") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
auto p = string_split<bool>(argv[i], split_delim);
|
||||||
|
params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
|
||||||
|
} else if (arg == "--poll") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
auto p = string_split<int>(argv[i], split_delim);
|
||||||
|
params.poll.insert(params.poll.end(), p.begin(), p.end());
|
||||||
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
|
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
@ -440,12 +491,14 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
auto p = string_split<int>(argv[i], split_delim);
|
auto p = string_split<int>(argv[i], split_delim);
|
||||||
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
|
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
|
||||||
|
#ifdef GGML_USE_RPC
|
||||||
} else if (arg == "-rpc" || arg == "--rpc") {
|
} else if (arg == "-rpc" || arg == "--rpc") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.rpc_servers.push_back(argv[i]);
|
params.rpc_servers.push_back(argv[i]);
|
||||||
|
#endif
|
||||||
} else if (arg == "-sm" || arg == "--split-mode") {
|
} else if (arg == "-sm" || arg == "--split-mode") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
@ -541,6 +594,18 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.reps = std::stoi(argv[i]);
|
params.reps = std::stoi(argv[i]);
|
||||||
|
} else if (arg == "--prio") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
|
||||||
|
} else if (arg == "--delay") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
params.delay = std::stoi(argv[i]);
|
||||||
} else if (arg == "-o" || arg == "--output") {
|
} else if (arg == "-o" || arg == "--output") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
@ -555,6 +620,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|||||||
invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
|
invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
|
||||||
} else if (arg == "-v" || arg == "--verbose") {
|
} else if (arg == "-v" || arg == "--verbose") {
|
||||||
params.verbose = true;
|
params.verbose = true;
|
||||||
|
} else if (arg == "--progress") {
|
||||||
|
params.progress = true;
|
||||||
} else {
|
} else {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
@ -585,6 +652,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|||||||
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
|
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
|
||||||
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
|
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
|
||||||
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
|
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
|
||||||
|
if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; }
|
||||||
|
if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; }
|
||||||
|
if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; }
|
||||||
|
|
||||||
return params;
|
return params;
|
||||||
}
|
}
|
||||||
@ -598,6 +668,9 @@ struct cmd_params_instance {
|
|||||||
ggml_type type_k;
|
ggml_type type_k;
|
||||||
ggml_type type_v;
|
ggml_type type_v;
|
||||||
int n_threads;
|
int n_threads;
|
||||||
|
std::string cpu_mask;
|
||||||
|
bool cpu_strict;
|
||||||
|
int poll;
|
||||||
int n_gpu_layers;
|
int n_gpu_layers;
|
||||||
std::string rpc_servers;
|
std::string rpc_servers;
|
||||||
llama_split_mode split_mode;
|
llama_split_mode split_mode;
|
||||||
@ -667,7 +740,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|||||||
for (const auto & tv : params.type_v)
|
for (const auto & tv : params.type_v)
|
||||||
for (const auto & nkvo : params.no_kv_offload)
|
for (const auto & nkvo : params.no_kv_offload)
|
||||||
for (const auto & fa : params.flash_attn)
|
for (const auto & fa : params.flash_attn)
|
||||||
for (const auto & nt : params.n_threads) {
|
for (const auto & nt : params.n_threads)
|
||||||
|
for (const auto & cm : params.cpu_mask)
|
||||||
|
for (const auto & cs : params.cpu_strict)
|
||||||
|
for (const auto & pl : params.poll) {
|
||||||
for (const auto & n_prompt : params.n_prompt) {
|
for (const auto & n_prompt : params.n_prompt) {
|
||||||
if (n_prompt == 0) {
|
if (n_prompt == 0) {
|
||||||
continue;
|
continue;
|
||||||
@ -681,6 +757,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|||||||
/* .type_k = */ tk,
|
/* .type_k = */ tk,
|
||||||
/* .type_v = */ tv,
|
/* .type_v = */ tv,
|
||||||
/* .n_threads = */ nt,
|
/* .n_threads = */ nt,
|
||||||
|
/* .cpu_mask = */ cm,
|
||||||
|
/* .cpu_strict = */ cs,
|
||||||
|
/* .poll = */ pl,
|
||||||
/* .n_gpu_layers = */ nl,
|
/* .n_gpu_layers = */ nl,
|
||||||
/* .rpc_servers = */ rpc,
|
/* .rpc_servers = */ rpc,
|
||||||
/* .split_mode = */ sm,
|
/* .split_mode = */ sm,
|
||||||
@ -707,6 +786,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|||||||
/* .type_k = */ tk,
|
/* .type_k = */ tk,
|
||||||
/* .type_v = */ tv,
|
/* .type_v = */ tv,
|
||||||
/* .n_threads = */ nt,
|
/* .n_threads = */ nt,
|
||||||
|
/* .cpu_mask = */ cm,
|
||||||
|
/* .cpu_strict = */ cs,
|
||||||
|
/* .poll = */ pl,
|
||||||
/* .n_gpu_layers = */ nl,
|
/* .n_gpu_layers = */ nl,
|
||||||
/* .rpc_servers = */ rpc,
|
/* .rpc_servers = */ rpc,
|
||||||
/* .split_mode = */ sm,
|
/* .split_mode = */ sm,
|
||||||
@ -733,6 +815,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|||||||
/* .type_k = */ tk,
|
/* .type_k = */ tk,
|
||||||
/* .type_v = */ tv,
|
/* .type_v = */ tv,
|
||||||
/* .n_threads = */ nt,
|
/* .n_threads = */ nt,
|
||||||
|
/* .cpu_mask = */ cm,
|
||||||
|
/* .cpu_strict = */ cs,
|
||||||
|
/* .poll = */ pl,
|
||||||
/* .n_gpu_layers = */ nl,
|
/* .n_gpu_layers = */ nl,
|
||||||
/* .rpc_servers = */ rpc,
|
/* .rpc_servers = */ rpc,
|
||||||
/* .split_mode = */ sm,
|
/* .split_mode = */ sm,
|
||||||
@ -769,6 +854,9 @@ struct test {
|
|||||||
int n_batch;
|
int n_batch;
|
||||||
int n_ubatch;
|
int n_ubatch;
|
||||||
int n_threads;
|
int n_threads;
|
||||||
|
std::string cpu_mask;
|
||||||
|
bool cpu_strict;
|
||||||
|
int poll;
|
||||||
bool has_rpc;
|
bool has_rpc;
|
||||||
ggml_type type_k;
|
ggml_type type_k;
|
||||||
ggml_type type_v;
|
ggml_type type_v;
|
||||||
@ -795,6 +883,9 @@ struct test {
|
|||||||
n_batch = inst.n_batch;
|
n_batch = inst.n_batch;
|
||||||
n_ubatch = inst.n_ubatch;
|
n_ubatch = inst.n_ubatch;
|
||||||
n_threads = inst.n_threads;
|
n_threads = inst.n_threads;
|
||||||
|
cpu_mask = inst.cpu_mask;
|
||||||
|
cpu_strict = inst.cpu_strict;
|
||||||
|
poll = inst.poll;
|
||||||
has_rpc = !inst.rpc_servers.empty();
|
has_rpc = !inst.rpc_servers.empty();
|
||||||
type_k = inst.type_k;
|
type_k = inst.type_k;
|
||||||
type_v = inst.type_v;
|
type_v = inst.type_v;
|
||||||
@ -872,13 +963,14 @@ struct test {
|
|||||||
"cpu_info", "gpu_info",
|
"cpu_info", "gpu_info",
|
||||||
"model_filename", "model_type", "model_size", "model_n_params",
|
"model_filename", "model_type", "model_size", "model_n_params",
|
||||||
"n_batch", "n_ubatch",
|
"n_batch", "n_ubatch",
|
||||||
"n_threads", "type_k", "type_v",
|
"n_threads", "cpu_mask", "cpu_strict", "poll",
|
||||||
|
"type_k", "type_v",
|
||||||
"n_gpu_layers", "split_mode",
|
"n_gpu_layers", "split_mode",
|
||||||
"main_gpu", "no_kv_offload", "flash_attn",
|
"main_gpu", "no_kv_offload", "flash_attn",
|
||||||
"tensor_split", "use_mmap", "embeddings",
|
"tensor_split", "use_mmap", "embeddings",
|
||||||
"n_prompt", "n_gen", "test_time",
|
"n_prompt", "n_gen", "test_time",
|
||||||
"avg_ns", "stddev_ns",
|
"avg_ns", "stddev_ns",
|
||||||
"avg_ts", "stddev_ts"
|
"avg_ts", "stddev_ts",
|
||||||
};
|
};
|
||||||
return fields;
|
return fields;
|
||||||
}
|
}
|
||||||
@ -887,7 +979,7 @@ struct test {
|
|||||||
|
|
||||||
static field_type get_field_type(const std::string & field) {
|
static field_type get_field_type(const std::string & field) {
|
||||||
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
|
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
|
||||||
field == "n_threads" ||
|
field == "n_threads" || field == "poll" ||
|
||||||
field == "model_size" || field == "model_n_params" ||
|
field == "model_size" || field == "model_n_params" ||
|
||||||
field == "n_gpu_layers" || field == "main_gpu" ||
|
field == "n_gpu_layers" || field == "main_gpu" ||
|
||||||
field == "n_prompt" || field == "n_gen" ||
|
field == "n_prompt" || field == "n_gen" ||
|
||||||
@ -896,6 +988,7 @@ struct test {
|
|||||||
}
|
}
|
||||||
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
|
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
|
||||||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
|
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
|
||||||
|
field == "cpu_strict" ||
|
||||||
field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
|
field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
|
||||||
return BOOL;
|
return BOOL;
|
||||||
}
|
}
|
||||||
@ -928,7 +1021,8 @@ struct test {
|
|||||||
cpu_info, gpu_info,
|
cpu_info, gpu_info,
|
||||||
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
||||||
std::to_string(n_batch), std::to_string(n_ubatch),
|
std::to_string(n_batch), std::to_string(n_ubatch),
|
||||||
std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
|
std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
|
||||||
|
ggml_type_name(type_k), ggml_type_name(type_v),
|
||||||
std::to_string(n_gpu_layers), split_mode_str(split_mode),
|
std::to_string(n_gpu_layers), split_mode_str(split_mode),
|
||||||
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
|
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
|
||||||
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
|
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
|
||||||
@ -996,38 +1090,39 @@ struct csv_printer : public printer {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
// Escape `value` so it can be placed between the double quotes of a JSON
// string literal.
//
//   - '"' and '\\' are backslash-escaped.
//   - Control characters (0x00..0x1f) are emitted as \u00XX.
//   - Every other byte, including bytes >= 0x80 that make up UTF-8 multi-byte
//     sequences, is copied through unchanged.
//
// Returns the escaped text without the surrounding quotes.
static std::string escape_json(const std::string & value) {
    std::string escaped;
    escaped.reserve(value.size());
    for (const char c : value) {
        // Classify the byte as unsigned: where plain `char` is signed, bytes
        // >= 0x80 are negative and would otherwise be mistaken for control
        // characters and formatted as a sign-extended, truncated hex value.
        const unsigned char byte = static_cast<unsigned char>(c);
        if (c == '"') {
            escaped += "\\\"";
        } else if (c == '\\') {
            escaped += "\\\\";
        } else if (byte <= 0x1f) {
            char buf[8]; // "\uXXXX" is 6 characters plus the terminating NUL
            snprintf(buf, sizeof(buf), "\\u%04x", static_cast<unsigned int>(byte));
            escaped += buf;
        } else {
            escaped += c;
        }
    }
    return escaped;
}
|
||||||
|
|
||||||
|
static std::string format_json_value(const std::string & field, const std::string & value) {
|
||||||
|
switch (test::get_field_type(field)) {
|
||||||
|
case test::STRING:
|
||||||
|
return "\"" + escape_json(value) + "\"";
|
||||||
|
case test::BOOL:
|
||||||
|
return value == "0" ? "false" : "true";
|
||||||
|
default:
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
struct json_printer : public printer {
|
struct json_printer : public printer {
|
||||||
bool first = true;
|
bool first = true;
|
||||||
|
|
||||||
static std::string escape_json(const std::string & value) {
|
|
||||||
std::string escaped;
|
|
||||||
for (auto c : value) {
|
|
||||||
if (c == '"') {
|
|
||||||
escaped += "\\\"";
|
|
||||||
} else if (c == '\\') {
|
|
||||||
escaped += "\\\\";
|
|
||||||
} else if (c <= 0x1f) {
|
|
||||||
char buf[8];
|
|
||||||
snprintf(buf, sizeof(buf), "\\u%04x", c);
|
|
||||||
escaped += buf;
|
|
||||||
} else {
|
|
||||||
escaped += c;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return escaped;
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::string format_value(const std::string & field, const std::string & value) {
|
|
||||||
switch (test::get_field_type(field)) {
|
|
||||||
case test::STRING:
|
|
||||||
return "\"" + escape_json(value) + "\"";
|
|
||||||
case test::BOOL:
|
|
||||||
return value == "0" ? "false" : "true";
|
|
||||||
default:
|
|
||||||
return value;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void print_header(const cmd_params & params) override {
|
void print_header(const cmd_params & params) override {
|
||||||
fprintf(fout, "[\n");
|
fprintf(fout, "[\n");
|
||||||
(void) params;
|
(void) params;
|
||||||
@ -1036,7 +1131,7 @@ struct json_printer : public printer {
|
|||||||
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
|
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
|
||||||
assert(fields.size() == values.size());
|
assert(fields.size() == values.size());
|
||||||
for (size_t i = 0; i < fields.size(); i++) {
|
for (size_t i = 0; i < fields.size(); i++) {
|
||||||
fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_value(fields.at(i), values.at(i)).c_str());
|
fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1059,6 +1154,25 @@ struct json_printer : public printer {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
struct jsonl_printer : public printer {
|
||||||
|
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
|
||||||
|
assert(fields.size() == values.size());
|
||||||
|
for (size_t i = 0; i < fields.size(); i++) {
|
||||||
|
fprintf(fout, "\"%s\": %s, ", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void print_test(const test & t) override {
|
||||||
|
fprintf(fout, "{");
|
||||||
|
print_fields(test::get_fields(), t.get_values());
|
||||||
|
fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str());
|
||||||
|
fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str());
|
||||||
|
fprintf(fout, "}\n");
|
||||||
|
fflush(fout);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
struct markdown_printer : public printer {
|
struct markdown_printer : public printer {
|
||||||
std::vector<std::string> fields;
|
std::vector<std::string> fields;
|
||||||
|
|
||||||
@ -1067,7 +1181,7 @@ struct markdown_printer : public printer {
|
|||||||
return -30;
|
return -30;
|
||||||
}
|
}
|
||||||
if (field == "t/s") {
|
if (field == "t/s") {
|
||||||
return 16;
|
return 20;
|
||||||
}
|
}
|
||||||
if (field == "size" || field == "params") {
|
if (field == "size" || field == "params") {
|
||||||
return 10;
|
return 10;
|
||||||
@ -1149,6 +1263,15 @@ struct markdown_printer : public printer {
|
|||||||
if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
|
if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
|
||||||
fields.emplace_back("n_threads");
|
fields.emplace_back("n_threads");
|
||||||
}
|
}
|
||||||
|
if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
|
||||||
|
fields.emplace_back("cpu_mask");
|
||||||
|
}
|
||||||
|
if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
|
||||||
|
fields.emplace_back("cpu_strict");
|
||||||
|
}
|
||||||
|
if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
|
||||||
|
fields.emplace_back("poll");
|
||||||
|
}
|
||||||
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
|
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
|
||||||
fields.emplace_back("n_batch");
|
fields.emplace_back("n_batch");
|
||||||
}
|
}
|
||||||
@ -1350,6 +1473,8 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
|
|||||||
return std::unique_ptr<printer>(new csv_printer());
|
return std::unique_ptr<printer>(new csv_printer());
|
||||||
case JSON:
|
case JSON:
|
||||||
return std::unique_ptr<printer>(new json_printer());
|
return std::unique_ptr<printer>(new json_printer());
|
||||||
|
case JSONL:
|
||||||
|
return std::unique_ptr<printer>(new jsonl_printer());
|
||||||
case MARKDOWN:
|
case MARKDOWN:
|
||||||
return std::unique_ptr<printer>(new markdown_printer());
|
return std::unique_ptr<printer>(new markdown_printer());
|
||||||
case SQL:
|
case SQL:
|
||||||
@ -1383,6 +1508,8 @@ int main(int argc, char ** argv) {
|
|||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
|
|
||||||
|
set_process_priority(params.prio);
|
||||||
|
|
||||||
// initialize printer
|
// initialize printer
|
||||||
std::unique_ptr<printer> p = create_printer(params.output_format);
|
std::unique_ptr<printer> p = create_printer(params.output_format);
|
||||||
std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
|
std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
|
||||||
@ -1402,7 +1529,13 @@ int main(int argc, char ** argv) {
|
|||||||
llama_model * lmodel = nullptr;
|
llama_model * lmodel = nullptr;
|
||||||
const cmd_params_instance * prev_inst = nullptr;
|
const cmd_params_instance * prev_inst = nullptr;
|
||||||
|
|
||||||
|
int params_idx = 0;
|
||||||
|
auto params_count = params_instances.size();
|
||||||
for (const auto & inst : params_instances) {
|
for (const auto & inst : params_instances) {
|
||||||
|
params_idx ++;
|
||||||
|
if (params.progress) {
|
||||||
|
fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count); // %zu: params_count holds params_instances.size() (unsigned size type), not a long
|
||||||
|
}
|
||||||
// keep the same model between tests when possible
|
// keep the same model between tests when possible
|
||||||
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
|
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
|
||||||
if (lmodel) {
|
if (lmodel) {
|
||||||
@ -1428,12 +1561,40 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
llama_kv_cache_clear(ctx);
|
llama_kv_cache_clear(ctx);
|
||||||
|
|
||||||
|
// cool off before the test
|
||||||
|
if (params.delay) {
|
||||||
|
std::this_thread::sleep_for(std::chrono::seconds(params.delay));
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
|
||||||
|
if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
|
||||||
|
fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
tpp.strict_cpu = t.cpu_strict;
|
||||||
|
tpp.poll = t.poll;
|
||||||
|
tpp.prio = params.prio;
|
||||||
|
|
||||||
|
struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
|
||||||
|
if (!threadpool) {
|
||||||
|
fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_attach_threadpool(ctx, threadpool, NULL);
|
||||||
|
|
||||||
// warmup run
|
// warmup run
|
||||||
if (t.n_prompt > 0) {
|
if (t.n_prompt > 0) {
|
||||||
|
if (params.progress) {
|
||||||
|
fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count); // %zu: params_count holds params_instances.size() (unsigned size type), not a long
|
||||||
|
}
|
||||||
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
|
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
|
||||||
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
|
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
|
||||||
}
|
}
|
||||||
if (t.n_gen > 0) {
|
if (t.n_gen > 0) {
|
||||||
|
if (params.progress) {
|
||||||
|
fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count); // %zu: params_count holds params_instances.size() (unsigned size type), not a long
|
||||||
|
}
|
||||||
test_gen(ctx, 1, 0, t.n_threads);
|
test_gen(ctx, 1, 0, t.n_threads);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1443,9 +1604,15 @@ int main(int argc, char ** argv) {
|
|||||||
uint64_t t_start = get_time_ns();
|
uint64_t t_start = get_time_ns();
|
||||||
|
|
||||||
if (t.n_prompt > 0) {
|
if (t.n_prompt > 0) {
|
||||||
|
if (params.progress) {
|
||||||
|
fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps); // %zu: params_count holds params_instances.size() (unsigned size type), not a long
|
||||||
|
}
|
||||||
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
|
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
|
||||||
}
|
}
|
||||||
if (t.n_gen > 0) {
|
if (t.n_gen > 0) {
|
||||||
|
if (params.progress) {
|
||||||
|
fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps); // %zu: params_count holds params_instances.size() (unsigned size type), not a long
|
||||||
|
}
|
||||||
test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
|
test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1463,9 +1630,11 @@ int main(int argc, char ** argv) {
|
|||||||
fflush(p_err->fout);
|
fflush(p_err->fout);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_print_timings(ctx);
|
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||||
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
|
|
||||||
|
ggml_threadpool_free(threadpool);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_free_model(lmodel);
|
llama_free_model(lmodel);
|
||||||
|
@ -120,8 +120,8 @@ Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmo
|
|||||||
LOGi("Using %d threads", n_threads);
|
LOGi("Using %d threads", n_threads);
|
||||||
|
|
||||||
llama_context_params ctx_params = llama_context_default_params();
|
llama_context_params ctx_params = llama_context_default_params();
|
||||||
ctx_params.seed = 1234;
|
|
||||||
ctx_params.n_ctx = 2048;
|
ctx_params.n_ctx = 2048;
|
||||||
ctx_params.n_threads = n_threads;
|
ctx_params.n_threads = n_threads;
|
||||||
ctx_params.n_threads_batch = n_threads;
|
ctx_params.n_threads_batch = n_threads;
|
||||||
|
|
||||||
@ -269,12 +269,6 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
|
|||||||
return env->NewStringUTF(result.str().c_str());
|
return env->NewStringUTF(result.str().c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C"
|
|
||||||
JNIEXPORT void JNICALL
|
|
||||||
Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
|
|
||||||
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
|
|
||||||
}
|
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT jlong JNICALL
|
JNIEXPORT jlong JNICALL
|
||||||
Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
|
Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
|
||||||
@ -311,6 +305,29 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
|
|||||||
return reinterpret_cast<jlong>(batch);
|
return reinterpret_cast<jlong>(batch);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
JNIEXPORT void JNICALL
|
||||||
|
Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
|
||||||
|
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
JNIEXPORT jlong JNICALL
|
||||||
|
Java_android_llama_cpp_LLamaAndroid_new_1sampler(JNIEnv *, jobject) {
|
||||||
|
auto sparams = llama_sampler_chain_default_params();
|
||||||
|
sparams.no_perf = true;
|
||||||
|
llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
||||||
|
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
|
||||||
|
|
||||||
|
return reinterpret_cast<jlong>(smpl);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
JNIEXPORT void JNICALL
|
||||||
|
Java_android_llama_cpp_LLamaAndroid_free_1sampler(JNIEnv *, jobject, jlong sampler_pointer) {
|
||||||
|
llama_sampler_free(reinterpret_cast<llama_sampler *>(sampler_pointer));
|
||||||
|
}
|
||||||
|
|
||||||
extern "C"
|
extern "C"
|
||||||
JNIEXPORT void JNICALL
|
JNIEXPORT void JNICALL
|
||||||
Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
|
Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
|
||||||
@ -381,31 +398,23 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
|
|||||||
jobject,
|
jobject,
|
||||||
jlong context_pointer,
|
jlong context_pointer,
|
||||||
jlong batch_pointer,
|
jlong batch_pointer,
|
||||||
|
jlong sampler_pointer,
|
||||||
jint n_len,
|
jint n_len,
|
||||||
jobject intvar_ncur
|
jobject intvar_ncur
|
||||||
) {
|
) {
|
||||||
const auto context = reinterpret_cast<llama_context *>(context_pointer);
|
const auto context = reinterpret_cast<llama_context *>(context_pointer);
|
||||||
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
|
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
|
||||||
|
const auto sampler = reinterpret_cast<llama_sampler *>(sampler_pointer);
|
||||||
const auto model = llama_get_model(context);
|
const auto model = llama_get_model(context);
|
||||||
|
|
||||||
if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);
|
if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);
|
||||||
if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
|
if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
|
||||||
if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");
|
if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");
|
||||||
|
|
||||||
auto n_vocab = llama_n_vocab(model);
|
|
||||||
auto logits = llama_get_logits_ith(context, batch->n_tokens - 1);
|
|
||||||
|
|
||||||
std::vector<llama_token_data> candidates;
|
|
||||||
candidates.reserve(n_vocab);
|
|
||||||
|
|
||||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
|
||||||
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
|
||||||
|
|
||||||
// sample the most likely token
|
// sample the most likely token
|
||||||
const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);
|
const auto new_token_id = llama_sampler_sample(sampler, context, -1);
|
||||||
|
|
||||||
|
llama_sampler_accept(sampler, new_token_id);
|
||||||
|
|
||||||
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
|
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
|
||||||
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
|
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
|
||||||
|
@ -45,8 +45,10 @@ class LLamaAndroid {
|
|||||||
private external fun free_context(context: Long)
|
private external fun free_context(context: Long)
|
||||||
private external fun backend_init(numa: Boolean)
|
private external fun backend_init(numa: Boolean)
|
||||||
private external fun backend_free()
|
private external fun backend_free()
|
||||||
private external fun free_batch(batch: Long)
|
|
||||||
private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long
|
private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long
|
||||||
|
private external fun free_batch(batch: Long)
|
||||||
|
private external fun new_sampler(): Long
|
||||||
|
private external fun free_sampler(sampler: Long)
|
||||||
private external fun bench_model(
|
private external fun bench_model(
|
||||||
context: Long,
|
context: Long,
|
||||||
model: Long,
|
model: Long,
|
||||||
@ -69,6 +71,7 @@ class LLamaAndroid {
|
|||||||
private external fun completion_loop(
|
private external fun completion_loop(
|
||||||
context: Long,
|
context: Long,
|
||||||
batch: Long,
|
batch: Long,
|
||||||
|
sampler: Long,
|
||||||
nLen: Int,
|
nLen: Int,
|
||||||
ncur: IntVar
|
ncur: IntVar
|
||||||
): String?
|
): String?
|
||||||
@ -101,8 +104,11 @@ class LLamaAndroid {
|
|||||||
val batch = new_batch(512, 0, 1)
|
val batch = new_batch(512, 0, 1)
|
||||||
if (batch == 0L) throw IllegalStateException("new_batch() failed")
|
if (batch == 0L) throw IllegalStateException("new_batch() failed")
|
||||||
|
|
||||||
|
val sampler = new_sampler()
|
||||||
|
if (sampler == 0L) throw IllegalStateException("new_sampler() failed")
|
||||||
|
|
||||||
Log.i(tag, "Loaded model $pathToModel")
|
Log.i(tag, "Loaded model $pathToModel")
|
||||||
threadLocalState.set(State.Loaded(model, context, batch))
|
threadLocalState.set(State.Loaded(model, context, batch, sampler))
|
||||||
}
|
}
|
||||||
else -> throw IllegalStateException("Model already loaded")
|
else -> throw IllegalStateException("Model already loaded")
|
||||||
}
|
}
|
||||||
@ -114,7 +120,7 @@ class LLamaAndroid {
|
|||||||
is State.Loaded -> {
|
is State.Loaded -> {
|
||||||
val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
|
val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
|
||||||
while (ncur.value <= nlen) {
|
while (ncur.value <= nlen) {
|
||||||
val str = completion_loop(state.context, state.batch, nlen, ncur)
|
val str = completion_loop(state.context, state.batch, state.sampler, nlen, ncur)
|
||||||
if (str == null) {
|
if (str == null) {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
@ -138,6 +144,7 @@ class LLamaAndroid {
|
|||||||
free_context(state.context)
|
free_context(state.context)
|
||||||
free_model(state.model)
|
free_model(state.model)
|
||||||
free_batch(state.batch)
|
free_batch(state.batch)
|
||||||
|
free_sampler(state.sampler)
|
||||||
|
|
||||||
threadLocalState.set(State.Idle)
|
threadLocalState.set(State.Idle)
|
||||||
}
|
}
|
||||||
@ -161,7 +168,7 @@ class LLamaAndroid {
|
|||||||
|
|
||||||
private sealed interface State {
|
private sealed interface State {
|
||||||
data object Idle: State
|
data object Idle: State
|
||||||
data class Loaded(val model: Long, val context: Long, val batch: Long): State
|
data class Loaded(val model: Long, val context: Long, val batch: Long, val sampler: Long): State
|
||||||
}
|
}
|
||||||
|
|
||||||
// Enforce only one instance of Llm.
|
// Enforce only one instance of Llm.
|
||||||
|
@ -24,6 +24,7 @@ func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama
|
|||||||
actor LlamaContext {
|
actor LlamaContext {
|
||||||
private var model: OpaquePointer
|
private var model: OpaquePointer
|
||||||
private var context: OpaquePointer
|
private var context: OpaquePointer
|
||||||
|
private var sampling: UnsafeMutablePointer<llama_sampler>
|
||||||
private var batch: llama_batch
|
private var batch: llama_batch
|
||||||
private var tokens_list: [llama_token]
|
private var tokens_list: [llama_token]
|
||||||
var is_done: Bool = false
|
var is_done: Bool = false
|
||||||
@ -42,9 +43,15 @@ actor LlamaContext {
|
|||||||
self.tokens_list = []
|
self.tokens_list = []
|
||||||
self.batch = llama_batch_init(512, 0, 1)
|
self.batch = llama_batch_init(512, 0, 1)
|
||||||
self.temporary_invalid_cchars = []
|
self.temporary_invalid_cchars = []
|
||||||
|
let sparams = llama_sampler_chain_default_params()
|
||||||
|
self.sampling = llama_sampler_chain_init(sparams)
|
||||||
|
llama_sampler_chain_add(self.sampling, llama_sampler_init_temp(0.4))
|
||||||
|
llama_sampler_chain_add(self.sampling, llama_sampler_init_softmax())
|
||||||
|
llama_sampler_chain_add(self.sampling, llama_sampler_init_dist(1234))
|
||||||
}
|
}
|
||||||
|
|
||||||
deinit {
|
deinit {
|
||||||
|
llama_sampler_free(sampling)
|
||||||
llama_batch_free(batch)
|
llama_batch_free(batch)
|
||||||
llama_free(context)
|
llama_free(context)
|
||||||
llama_free_model(model)
|
llama_free_model(model)
|
||||||
@ -69,10 +76,9 @@ actor LlamaContext {
|
|||||||
print("Using \(n_threads) threads")
|
print("Using \(n_threads) threads")
|
||||||
|
|
||||||
var ctx_params = llama_context_default_params()
|
var ctx_params = llama_context_default_params()
|
||||||
ctx_params.seed = 1234
|
|
||||||
ctx_params.n_ctx = 2048
|
ctx_params.n_ctx = 2048
|
||||||
ctx_params.n_threads = UInt32(n_threads)
|
ctx_params.n_threads = Int32(n_threads)
|
||||||
ctx_params.n_threads_batch = UInt32(n_threads)
|
ctx_params.n_threads_batch = Int32(n_threads)
|
||||||
|
|
||||||
let context = llama_new_context_with_model(model, ctx_params)
|
let context = llama_new_context_with_model(model, ctx_params)
|
||||||
guard let context else {
|
guard let context else {
|
||||||
@ -144,20 +150,9 @@ actor LlamaContext {
|
|||||||
func completion_loop() -> String {
|
func completion_loop() -> String {
|
||||||
var new_token_id: llama_token = 0
|
var new_token_id: llama_token = 0
|
||||||
|
|
||||||
let n_vocab = llama_n_vocab(model)
|
new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)
|
||||||
let logits = llama_get_logits_ith(context, batch.n_tokens - 1)
|
|
||||||
|
|
||||||
var candidates = Array<llama_token_data>()
|
llama_sampler_accept(sampling, new_token_id)
|
||||||
candidates.reserveCapacity(Int(n_vocab))
|
|
||||||
|
|
||||||
for token_id in 0..<n_vocab {
|
|
||||||
candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
|
|
||||||
}
|
|
||||||
candidates.withUnsafeMutableBufferPointer() { buffer in
|
|
||||||
var candidates_p = llama_token_data_array(data: buffer.baseAddress, size: buffer.count, sorted: false)
|
|
||||||
|
|
||||||
new_token_id = llama_sample_token_greedy(context, &candidates_p)
|
|
||||||
}
|
|
||||||
|
|
||||||
if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
|
if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
|
||||||
print("\n")
|
print("\n")
|
||||||
|
@ -15,8 +15,8 @@ cd llama.cpp
|
|||||||
Convert the PyTorch model to gguf files (you can also download the [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) files that we have already converted)
|
Convert the PyTorch model to gguf files (you can also download the [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) files that we have already converted)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
python ./examples/minicpmv/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
|
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
|
||||||
python ./examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
|
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
|
||||||
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
|
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
|
||||||
|
|
||||||
# quantize int4 version
|
# quantize int4 version
|
||||||
|
@ -20,6 +20,10 @@
|
|||||||
#include "ggml-cann.h"
|
#include "ggml-cann.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_VULKAN
|
||||||
|
#include "ggml-vulkan.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
#define STB_IMAGE_IMPLEMENTATION
|
#define STB_IMAGE_IMPLEMENTATION
|
||||||
#include "stb_image.h"
|
#include "stb_image.h"
|
||||||
|
|
||||||
@ -212,13 +216,19 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
|
|||||||
|
|
||||||
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
||||||
if (search.empty()) {
|
if (search.empty()) {
|
||||||
return; // Avoid infinite loop if 'search' is an empty string
|
return;
|
||||||
}
|
}
|
||||||
|
std::string builder;
|
||||||
|
builder.reserve(s.length());
|
||||||
size_t pos = 0;
|
size_t pos = 0;
|
||||||
while ((pos = s.find(search, pos)) != std::string::npos) {
|
size_t last_pos = 0;
|
||||||
s.replace(pos, search.length(), replace);
|
while ((pos = s.find(search, last_pos)) != std::string::npos) {
|
||||||
pos += replace.length();
|
builder.append(s, last_pos, pos - last_pos);
|
||||||
|
builder.append(replace);
|
||||||
|
last_pos = pos + search.length();
|
||||||
}
|
}
|
||||||
|
builder.append(s, last_pos, std::string::npos);
|
||||||
|
s = std::move(builder);
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
|
static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
|
||||||
@ -1108,7 +1118,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
clip_ctx * new_clip = new clip_ctx;
|
clip_ctx * new_clip = new clip_ctx{};
|
||||||
|
|
||||||
// update projector type
|
// update projector type
|
||||||
{
|
{
|
||||||
@ -1142,6 +1152,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
LOG_TEE("%s: CLIP using CANN backend\n", __func__);
|
LOG_TEE("%s: CLIP using CANN backend\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef GGML_USE_VULKAN
|
||||||
|
new_clip->backend = ggml_backend_vk_init(0);
|
||||||
|
LOG_TEE("%s: CLIP using Vulkan backend\n", __func__);
|
||||||
|
#endif
|
||||||
|
|
||||||
if (!new_clip->backend) {
|
if (!new_clip->backend) {
|
||||||
new_clip->backend = ggml_backend_cpu_init();
|
new_clip->backend = ggml_backend_cpu_init();
|
||||||
@ -1609,7 +1623,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline float clip(float x, float lower, float upper) {
|
inline int clip(int x, int lower, int upper) {
|
||||||
return std::max(lower, std::min(x, upper));
|
return std::max(lower, std::min(x, upper));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1813,10 +1827,6 @@ static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size
|
|||||||
return refine_size;
|
return refine_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline int clip(int x, int lower, int upper) {
|
|
||||||
return std::max(lower, std::min(x, upper));
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
|
static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
|
||||||
std::vector<int> candidate_split_grids_nums;
|
std::vector<int> candidate_split_grids_nums;
|
||||||
for (int i : {multiple - 1, multiple, multiple + 1}) {
|
for (int i : {multiple - 1, multiple, multiple + 1}) {
|
||||||
|
@ -40,11 +40,11 @@ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char * sample(struct llama_sampling_context * ctx_sampling,
|
static const char * sample(struct gpt_sampler * smpl,
|
||||||
struct llama_context * ctx_llama,
|
struct llama_context * ctx_llama,
|
||||||
int * n_past) {
|
int * n_past) {
|
||||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
|
const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1);
|
||||||
llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
|
gpt_sampler_accept(smpl, id, true);
|
||||||
static std::string ret;
|
static std::string ret;
|
||||||
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
|
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
|
||||||
ret = "</s>";
|
ret = "</s>";
|
||||||
@ -112,9 +112,7 @@ struct llava_context {
|
|||||||
struct llama_model * model = NULL;
|
struct llama_model * model = NULL;
|
||||||
};
|
};
|
||||||
|
|
||||||
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
static void print_usage(int, char ** argv) {
|
||||||
gpt_params_print_usage(argc, argv, params);
|
|
||||||
|
|
||||||
LOG_TEE("\n example usage:\n");
|
LOG_TEE("\n example usage:\n");
|
||||||
LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||||
LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||||
@ -129,14 +127,14 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
|
|||||||
if (!params->image.empty()) {
|
if (!params->image.empty()) {
|
||||||
LOG_TEE("using base64 encoded image instead of command line image path\n");
|
LOG_TEE("using base64 encoded image instead of command line image path\n");
|
||||||
}
|
}
|
||||||
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
|
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
|
||||||
if (!embed) {
|
if (!embed) {
|
||||||
LOG_TEE("%s: can't load image from prompt\n", __func__);
|
LOG_TEE("%s: can't load image from prompt\n", __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
params->prompt = remove_image_from_prompt(prompt);
|
params->prompt = remove_image_from_prompt(prompt);
|
||||||
} else {
|
} else {
|
||||||
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
|
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
|
||||||
if (!embed) {
|
if (!embed) {
|
||||||
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
|
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
|
||||||
return NULL;
|
return NULL;
|
||||||
@ -191,15 +189,15 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||||||
|
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
|
|
||||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
|
struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
|
||||||
if (!ctx_sampling) {
|
if (!smpl) {
|
||||||
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
|
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string response = "";
|
std::string response = "";
|
||||||
for (int i = 0; i < max_tgt_len; i++) {
|
for (int i = 0; i < max_tgt_len; i++) {
|
||||||
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
|
const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
|
||||||
response += tmp;
|
response += tmp;
|
||||||
if (strcmp(tmp, "</s>") == 0) break;
|
if (strcmp(tmp, "</s>") == 0) break;
|
||||||
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||||
@ -211,7 +209,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_sampling_free(ctx_sampling);
|
gpt_sampler_free(smpl);
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -280,8 +278,8 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_LLAVA, print_usage);
|
||||||
print_usage(argc, argv, params);
|
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -293,7 +291,7 @@ int main(int argc, char ** argv) {
|
|||||||
#endif // LOG_DISABLE_LOGS
|
#endif // LOG_DISABLE_LOGS
|
||||||
|
|
||||||
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||||
print_usage(argc, argv, {});
|
print_usage(argc, argv);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
auto model = llava_init(¶ms);
|
auto model = llava_init(¶ms);
|
||||||
@ -310,7 +308,7 @@ int main(int argc, char ** argv) {
|
|||||||
// process the prompt
|
// process the prompt
|
||||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||||
|
|
||||||
llama_print_timings(ctx_llava->ctx_llama);
|
llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
|
||||||
llava_image_embed_free(image_embed);
|
llava_image_embed_free(image_embed);
|
||||||
ctx_llava->model = NULL;
|
ctx_llava->model = NULL;
|
||||||
llava_free(ctx_llava);
|
llava_free(ctx_llava);
|
||||||
@ -327,7 +325,7 @@ int main(int argc, char ** argv) {
|
|||||||
// process the prompt
|
// process the prompt
|
||||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||||
|
|
||||||
llama_print_timings(ctx_llava->ctx_llama);
|
llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
|
||||||
llava_image_embed_free(image_embed);
|
llava_image_embed_free(image_embed);
|
||||||
ctx_llava->model = NULL;
|
ctx_llava->model = NULL;
|
||||||
llava_free(ctx_llava);
|
llava_free(ctx_llava);
|
||||||
|
@ -163,11 +163,11 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
|
|||||||
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
|
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char * sample(struct llama_sampling_context * ctx_sampling,
|
static const char * sample(struct gpt_sampler * smpl,
|
||||||
struct llama_context * ctx_llama,
|
struct llama_context * ctx_llama,
|
||||||
int * n_past) {
|
int * n_past) {
|
||||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
|
const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1);
|
||||||
llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
|
gpt_sampler_accept(smpl, id, true);
|
||||||
static std::string ret;
|
static std::string ret;
|
||||||
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
|
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
|
||||||
ret = "</s>";
|
ret = "</s>";
|
||||||
@ -180,7 +180,7 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
|
|||||||
|
|
||||||
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
|
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
|
||||||
auto ctx_clip = clip_init_context(params);
|
auto ctx_clip = clip_init_context(params);
|
||||||
auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
|
auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
|
||||||
if (!embeds) {
|
if (!embeds) {
|
||||||
std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
|
std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
|
||||||
return NULL;
|
return NULL;
|
||||||
@ -214,7 +214,7 @@ static struct llava_context * minicpmv_init(gpt_params * params, const std::stri
|
|||||||
return ctx_llava;
|
return ctx_llava;
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
|
static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
|
||||||
std::string user_prompt = prompt;
|
std::string user_prompt = prompt;
|
||||||
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
|
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
|
||||||
if (!is_first) {
|
if (!is_first) {
|
||||||
@ -238,13 +238,13 @@ static struct llama_sampling_context * llama_init(struct llava_context * ctx_lla
|
|||||||
|
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
|
|
||||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
|
struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
|
||||||
return ctx_sampling;
|
return smpl;
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){
|
static const char * llama_loop(struct llava_context * ctx_llava,struct gpt_sampler * smpl, int &n_past){
|
||||||
|
|
||||||
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
|
const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
|
||||||
return tmp;
|
return tmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -253,8 +253,8 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, show_additional_info);
|
||||||
show_additional_info(argc, argv);
|
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -266,7 +266,6 @@ int main(int argc, char ** argv) {
|
|||||||
#endif // LOG_DISABLE_LOGS
|
#endif // LOG_DISABLE_LOGS
|
||||||
|
|
||||||
if (params.mmproj.empty() || (params.image.empty())) {
|
if (params.mmproj.empty() || (params.image.empty())) {
|
||||||
gpt_params_print_usage(argc, argv, params);
|
|
||||||
show_additional_info(argc, argv);
|
show_additional_info(argc, argv);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@ -278,12 +277,12 @@ int main(int argc, char ** argv) {
|
|||||||
if (!params.prompt.empty()) {
|
if (!params.prompt.empty()) {
|
||||||
LOG_TEE("<user>%s\n", params.prompt.c_str());
|
LOG_TEE("<user>%s\n", params.prompt.c_str());
|
||||||
LOG_TEE("<assistant>");
|
LOG_TEE("<assistant>");
|
||||||
auto ctx_sampling = llama_init(ctx_llava, ¶ms, params.prompt.c_str(), n_past, true);
|
auto smpl = llama_init(ctx_llava, ¶ms, params.prompt.c_str(), n_past, true);
|
||||||
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
|
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
|
||||||
std::string response = "";
|
std::string response = "";
|
||||||
bool have_tmp = false;
|
bool have_tmp = false;
|
||||||
for (int i = 0; i < max_tgt_len; i++) {
|
for (int i = 0; i < max_tgt_len; i++) {
|
||||||
auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
|
auto tmp = llama_loop(ctx_llava, smpl, n_past);
|
||||||
response += tmp;
|
response += tmp;
|
||||||
if (strcmp(tmp, "</s>") == 0){
|
if (strcmp(tmp, "</s>") == 0){
|
||||||
if(!have_tmp)continue;
|
if(!have_tmp)continue;
|
||||||
@ -296,18 +295,18 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
}
|
}
|
||||||
llama_sampling_free(ctx_sampling);
|
gpt_sampler_free(smpl);
|
||||||
}else {
|
}else {
|
||||||
while (true) {
|
while (true) {
|
||||||
LOG_TEE("<user>");
|
LOG_TEE("<user>");
|
||||||
std::string prompt;
|
std::string prompt;
|
||||||
std::getline(std::cin, prompt);
|
std::getline(std::cin, prompt);
|
||||||
LOG_TEE("<assistant>");
|
LOG_TEE("<assistant>");
|
||||||
auto ctx_sampling = llama_init(ctx_llava, ¶ms, prompt, n_past, true);
|
auto smpl = llama_init(ctx_llava, ¶ms, prompt, n_past, true);
|
||||||
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
|
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
|
||||||
std::string response = "";
|
std::string response = "";
|
||||||
for (int i = 0; i < max_tgt_len; i++) {
|
for (int i = 0; i < max_tgt_len; i++) {
|
||||||
auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
|
auto tmp = llama_loop(ctx_llava, smpl, n_past);
|
||||||
response += tmp;
|
response += tmp;
|
||||||
if (strcmp(tmp, "</s>") == 0) break;
|
if (strcmp(tmp, "</s>") == 0) break;
|
||||||
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||||
@ -315,11 +314,11 @@ int main(int argc, char ** argv) {
|
|||||||
if (strstr(response.c_str(), "<user>")) break; // minicpm-v
|
if (strstr(response.c_str(), "<user>")) break; // minicpm-v
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
}
|
}
|
||||||
llama_sampling_free(ctx_sampling);
|
gpt_sampler_free(smpl);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
printf("\n");
|
printf("\n");
|
||||||
llama_print_timings(ctx_llava->ctx_llama);
|
llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
|
||||||
|
|
||||||
ctx_llava->model = NULL;
|
ctx_llava->model = NULL;
|
||||||
llava_free(ctx_llava);
|
llava_free(ctx_llava);
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cmath>
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
@ -37,8 +36,8 @@ struct ngram_container {
|
|||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||||
gpt_params_print_usage(argc, argv, params);
|
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -118,7 +117,7 @@ int main(int argc, char ** argv) {
|
|||||||
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
|
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
|
||||||
|
|
||||||
// target model sampling context
|
// target model sampling context
|
||||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
|
struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
|
||||||
|
|
||||||
// verification n-grams
|
// verification n-grams
|
||||||
std::vector<ngram_data> ngrams_cur(G);
|
std::vector<ngram_data> ngrams_cur(G);
|
||||||
@ -159,9 +158,9 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// sample first token
|
// sample first token
|
||||||
{
|
{
|
||||||
id = llama_sampling_sample(ctx_sampling, ctx, NULL, 0);
|
id = gpt_sampler_sample(smpl, ctx, 0);
|
||||||
|
|
||||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
gpt_sampler_accept(smpl, id, true);
|
||||||
|
|
||||||
{
|
{
|
||||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||||
@ -284,9 +283,9 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// sample the next token
|
// sample the next token
|
||||||
id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_batch);
|
id = gpt_sampler_sample(smpl, ctx, i_batch);
|
||||||
|
|
||||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
gpt_sampler_accept(smpl, id, true);
|
||||||
|
|
||||||
// print
|
// print
|
||||||
{
|
{
|
||||||
@ -361,7 +360,7 @@ int main(int argc, char ** argv) {
|
|||||||
if (v == 0) {
|
if (v == 0) {
|
||||||
// sample from the last level
|
// sample from the last level
|
||||||
for (int i = 0; i < W; i++) {
|
for (int i = 0; i < W; i++) {
|
||||||
tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
|
tokens_j[N - 2][i] = gpt_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (int i = 0; i < W; i++) {
|
for (int i = 0; i < W; i++) {
|
||||||
@ -468,10 +467,12 @@ int main(int argc, char ** argv) {
|
|||||||
LOG_TEE("n_predict = %d\n", n_predict);
|
LOG_TEE("n_predict = %d\n", n_predict);
|
||||||
LOG_TEE("n_accept = %d\n", n_accept);
|
LOG_TEE("n_accept = %d\n", n_accept);
|
||||||
|
|
||||||
llama_print_timings(ctx);
|
LOG_TEE("\n");
|
||||||
|
gpt_perf_print(ctx, smpl);
|
||||||
|
|
||||||
|
gpt_sampler_free(smpl);
|
||||||
|
|
||||||
llama_kv_cache_view_free(&kvc_view);
|
llama_kv_cache_view_free(&kvc_view);
|
||||||
llama_sampling_free(ctx_sampling);
|
|
||||||
|
|
||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
|
|
||||||
|
@ -13,8 +13,8 @@
|
|||||||
int main(int argc, char ** argv){
|
int main(int argc, char ** argv){
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||||
gpt_params_print_usage(argc, argv, params);
|
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -15,8 +15,8 @@
|
|||||||
int main(int argc, char ** argv){
|
int main(int argc, char ** argv){
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||||
gpt_params_print_usage(argc, argv, params);
|
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3,19 +3,17 @@
|
|||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "ngram-cache.h"
|
#include "ngram-cache.h"
|
||||||
|
|
||||||
#include <cmath>
|
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <unordered_map>
|
|
||||||
|
|
||||||
int main(int argc, char ** argv){
|
int main(int argc, char ** argv){
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||||
gpt_params_print_usage(argc, argv, params);
|
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -106,7 +104,7 @@ int main(int argc, char ** argv){
|
|||||||
|
|
||||||
bool has_eos = false;
|
bool has_eos = false;
|
||||||
|
|
||||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
|
struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
|
||||||
|
|
||||||
std::vector<llama_token> draft;
|
std::vector<llama_token> draft;
|
||||||
|
|
||||||
@ -130,9 +128,9 @@ int main(int argc, char ** argv){
|
|||||||
int i_dft = 0;
|
int i_dft = 0;
|
||||||
while (true) {
|
while (true) {
|
||||||
// sample from the target model
|
// sample from the target model
|
||||||
llama_token id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_dft);
|
llama_token id = gpt_sampler_sample(smpl, ctx, i_dft);
|
||||||
|
|
||||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
gpt_sampler_accept(smpl, id, true);
|
||||||
|
|
||||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||||
|
|
||||||
@ -240,10 +238,12 @@ int main(int argc, char ** argv){
|
|||||||
LOG_TEE("n_accept = %d\n", n_accept);
|
LOG_TEE("n_accept = %d\n", n_accept);
|
||||||
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||||
|
|
||||||
LOG_TEE("\ntarget:\n");
|
LOG_TEE("\ntarget:\n\n");
|
||||||
llama_print_timings(ctx);
|
llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
|
||||||
|
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||||
|
|
||||||
|
gpt_sampler_free(smpl);
|
||||||
|
|
||||||
llama_sampling_free(ctx_sampling);
|
|
||||||
llama_batch_free(batch_tgt);
|
llama_batch_free(batch_tgt);
|
||||||
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
|
@ -33,6 +33,7 @@
|
|||||||
|
|
||||||
static llama_context ** g_ctx;
|
static llama_context ** g_ctx;
|
||||||
static llama_model ** g_model;
|
static llama_model ** g_model;
|
||||||
|
static gpt_sampler ** g_smpl;
|
||||||
static gpt_params * g_params;
|
static gpt_params * g_params;
|
||||||
static std::vector<llama_token> * g_input_tokens;
|
static std::vector<llama_token> * g_input_tokens;
|
||||||
static std::ostringstream * g_output_ss;
|
static std::ostringstream * g_output_ss;
|
||||||
@ -40,6 +41,13 @@ static std::vector<llama_token> * g_output_tokens;
|
|||||||
static bool is_interacting = false;
|
static bool is_interacting = false;
|
||||||
static bool need_insert_eot = false;
|
static bool need_insert_eot = false;
|
||||||
|
|
||||||
|
static void print_usage(int, char ** argv) {
|
||||||
|
printf("\nexample usage:\n");
|
||||||
|
printf("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
|
||||||
|
printf("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
|
||||||
static bool file_exists(const std::string & path) {
|
static bool file_exists(const std::string & path) {
|
||||||
std::ifstream f(path.c_str());
|
std::ifstream f(path.c_str());
|
||||||
return f.good();
|
return f.good();
|
||||||
@ -92,7 +100,7 @@ static void write_logfile(
|
|||||||
yaml_dump_string_multiline(logfile, "output", output.c_str());
|
yaml_dump_string_multiline(logfile, "output", output.c_str());
|
||||||
yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
|
yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
|
||||||
|
|
||||||
llama_dump_timing_info_yaml(logfile, ctx);
|
llama_perf_dump_yaml(logfile, ctx);
|
||||||
fclose(logfile);
|
fclose(logfile);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -105,7 +113,7 @@ static void sigint_handler(int signo) {
|
|||||||
} else {
|
} else {
|
||||||
console::cleanup();
|
console::cleanup();
|
||||||
printf("\n");
|
printf("\n");
|
||||||
llama_print_timings(*g_ctx);
|
gpt_perf_print(*g_ctx, *g_smpl);
|
||||||
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
|
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
|
||||||
_exit(130);
|
_exit(130);
|
||||||
}
|
}
|
||||||
@ -121,8 +129,7 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
|
|||||||
|
|
||||||
static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
|
static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
|
||||||
llama_chat_msg new_msg{role, content};
|
llama_chat_msg new_msg{role, content};
|
||||||
auto formatted = llama_chat_format_single(
|
auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
|
||||||
model, g_params->chat_template, chat_msgs, new_msg, role == "user");
|
|
||||||
chat_msgs.push_back({role, content});
|
chat_msgs.push_back({role, content});
|
||||||
LOG("formatted: %s\n", formatted.c_str());
|
LOG("formatted: %s\n", formatted.c_str());
|
||||||
return formatted;
|
return formatted;
|
||||||
@ -131,13 +138,13 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
|
|||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
g_params = ¶ms;
|
g_params = ¶ms;
|
||||||
|
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_MAIN, print_usage);
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||||
gpt_params_print_usage(argc, argv, params);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_sampling_params & sparams = params.sparams;
|
auto & sparams = params.sparams;
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
#ifndef LOG_DISABLE_LOGS
|
||||||
log_set_target(log_filename_generator("main", "log"));
|
log_set_target(log_filename_generator("main", "log"));
|
||||||
@ -183,27 +190,23 @@ int main(int argc, char ** argv) {
|
|||||||
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
|
print_build_info();
|
||||||
LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
|
||||||
|
|
||||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
|
||||||
params.seed = time(NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG_TEE("%s: seed = %u\n", __func__, params.seed);
|
|
||||||
|
|
||||||
std::mt19937 rng(params.seed);
|
|
||||||
|
|
||||||
LOG("%s: llama backend init\n", __func__);
|
LOG("%s: llama backend init\n", __func__);
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
|
|
||||||
llama_model * model;
|
llama_model * model = nullptr;
|
||||||
llama_context * ctx;
|
llama_context * ctx = nullptr;
|
||||||
llama_context * ctx_guidance = NULL;
|
gpt_sampler * smpl = nullptr;
|
||||||
|
|
||||||
std::vector<llama_chat_msg> chat_msgs;
|
std::vector<llama_chat_msg> chat_msgs;
|
||||||
|
|
||||||
g_model = &model;
|
g_model = &model;
|
||||||
g_ctx = &ctx;
|
g_ctx = &ctx;
|
||||||
|
g_smpl = &smpl;
|
||||||
|
|
||||||
// load the model and apply lora adapter, if any
|
// load the model and apply lora adapter, if any
|
||||||
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||||
@ -211,16 +214,43 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
model = llama_init.model;
|
model = llama_init.model;
|
||||||
ctx = llama_init.context;
|
ctx = llama_init.context;
|
||||||
if (sparams.cfg_scale > 1.f) {
|
|
||||||
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
|
|
||||||
ctx_guidance = llama_new_context_with_model(model, lparams);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
LOG_TEE("%s: error: unable to load model\n", __func__);
|
LOG_TEE("%s: error: unable to load model\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
LOG("%s: llama threadpool init = n_threads = %d\n",
|
||||||
|
__func__,
|
||||||
|
(int) params.cpuparams.n_threads
|
||||||
|
);
|
||||||
|
struct ggml_threadpool_params tpp_batch =
|
||||||
|
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
|
||||||
|
struct ggml_threadpool_params tpp =
|
||||||
|
ggml_threadpool_params_from_cpu_params(params.cpuparams);
|
||||||
|
|
||||||
|
set_process_priority(params.cpuparams.priority);
|
||||||
|
|
||||||
|
struct ggml_threadpool * threadpool_batch = NULL;
|
||||||
|
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
|
||||||
|
threadpool_batch = ggml_threadpool_new(&tpp_batch);
|
||||||
|
if (!threadpool_batch) {
|
||||||
|
LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start the non-batch threadpool in the paused state
|
||||||
|
tpp.paused = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
|
||||||
|
if (!threadpool) {
|
||||||
|
LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_attach_threadpool(ctx, threadpool, threadpool_batch);
|
||||||
|
|
||||||
const int n_ctx_train = llama_n_ctx_train(model);
|
const int n_ctx_train = llama_n_ctx_train(model);
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
LOG("n_ctx: %d\n", n_ctx);
|
LOG("n_ctx: %d\n", n_ctx);
|
||||||
@ -303,24 +333,6 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Tokenize negative prompt
|
// Tokenize negative prompt
|
||||||
std::vector<llama_token> guidance_inp;
|
|
||||||
int guidance_offset = 0;
|
|
||||||
int original_prompt_len = 0;
|
|
||||||
if (ctx_guidance) {
|
|
||||||
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
|
|
||||||
|
|
||||||
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
|
|
||||||
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
|
|
||||||
|
|
||||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
|
||||||
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
|
|
||||||
|
|
||||||
original_prompt_len = original_inp.size();
|
|
||||||
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
|
|
||||||
LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
|
|
||||||
LOG("guidance_offset: %s", log_tostr(guidance_offset));
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((int) embd_inp.size() > n_ctx - 4) {
|
if ((int) embd_inp.size() > n_ctx - 4) {
|
||||||
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||||
return 1;
|
return 1;
|
||||||
@ -352,8 +364,8 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
LOGLN(
|
LOGLN(
|
||||||
"recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu, embd_inp.size() %zu",
|
"recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu",
|
||||||
log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size());
|
log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
|
||||||
|
|
||||||
// if we will use the cache for the full prompt without reaching the end of the cache, force
|
// if we will use the cache for the full prompt without reaching the end of the cache, force
|
||||||
// reevaluation of the last token to recalculate the cached logits
|
// reevaluation of the last token to recalculate the cached logits
|
||||||
@ -387,15 +399,6 @@ int main(int argc, char ** argv) {
|
|||||||
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ctx_guidance) {
|
|
||||||
LOG_TEE("\n");
|
|
||||||
LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
|
|
||||||
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
|
||||||
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
|
||||||
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (params.n_keep > add_bos) {
|
if (params.n_keep > add_bos) {
|
||||||
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
|
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
|
||||||
for (int i = 0; i < params.n_keep; i++) {
|
for (int i = 0; i < params.n_keep; i++) {
|
||||||
@ -461,8 +464,15 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
|
|
||||||
LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
|
smpl = gpt_sampler_init(model, sparams);
|
||||||
|
if (!smpl) {
|
||||||
|
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG_TEE("sampling params: \n%s\n", sparams.print().c_str());
|
||||||
|
LOG_TEE(" sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str());
|
||||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||||
|
|
||||||
// group-attention state
|
// group-attention state
|
||||||
@ -509,7 +519,6 @@ int main(int argc, char ** argv) {
|
|||||||
int n_remain = params.n_predict;
|
int n_remain = params.n_predict;
|
||||||
int n_consumed = 0;
|
int n_consumed = 0;
|
||||||
int n_session_consumed = 0;
|
int n_session_consumed = 0;
|
||||||
int n_past_guidance = 0;
|
|
||||||
|
|
||||||
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
|
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
|
||||||
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
|
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
|
||||||
@ -521,7 +530,6 @@ int main(int argc, char ** argv) {
|
|||||||
display = params.display_prompt;
|
display = params.display_prompt;
|
||||||
|
|
||||||
std::vector<llama_token> embd;
|
std::vector<llama_token> embd;
|
||||||
std::vector<llama_token> embd_guidance;
|
|
||||||
|
|
||||||
// tokenized antiprompts
|
// tokenized antiprompts
|
||||||
std::vector<std::vector<llama_token>> antiprompt_ids;
|
std::vector<std::vector<llama_token>> antiprompt_ids;
|
||||||
@ -531,12 +539,6 @@ int main(int argc, char ** argv) {
|
|||||||
antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
|
antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
|
||||||
}
|
}
|
||||||
|
|
||||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
|
|
||||||
if (!ctx_sampling) {
|
|
||||||
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (llama_model_has_encoder(model)) {
|
if (llama_model_has_encoder(model)) {
|
||||||
int enc_input_size = embd_inp.size();
|
int enc_input_size = embd_inp.size();
|
||||||
llama_token * enc_input_buf = embd_inp.data();
|
llama_token * enc_input_buf = embd_inp.data();
|
||||||
@ -578,7 +580,7 @@ int main(int argc, char ** argv) {
|
|||||||
// if we run out of context:
|
// if we run out of context:
|
||||||
// - take the n_keep first tokens from the original prompt (via n_past)
|
// - take the n_keep first tokens from the original prompt (via n_past)
|
||||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
||||||
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) >= n_ctx) {
|
if (n_past + (int) embd.size() >= n_ctx) {
|
||||||
if (params.n_predict == -2) {
|
if (params.n_predict == -2) {
|
||||||
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
||||||
break;
|
break;
|
||||||
@ -595,11 +597,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
n_past -= n_discard;
|
n_past -= n_discard;
|
||||||
|
|
||||||
if (ctx_guidance) {
|
LOG("after swap: n_past = %d\n", n_past);
|
||||||
n_past_guidance -= n_discard;
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
|
|
||||||
|
|
||||||
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
||||||
|
|
||||||
@ -652,46 +650,6 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// evaluate tokens in batches
|
|
||||||
// embd is typically prepared beforehand to fit within a batch, but not always
|
|
||||||
if (ctx_guidance) {
|
|
||||||
int input_size = 0;
|
|
||||||
llama_token * input_buf = NULL;
|
|
||||||
|
|
||||||
if (n_past_guidance < (int) guidance_inp.size()) {
|
|
||||||
// Guidance context should have the same data with these modifications:
|
|
||||||
//
|
|
||||||
// * Replace the initial prompt
|
|
||||||
// * Shift everything by guidance_offset
|
|
||||||
embd_guidance = guidance_inp;
|
|
||||||
if (embd.begin() + original_prompt_len < embd.end()) {
|
|
||||||
embd_guidance.insert(
|
|
||||||
embd_guidance.end(),
|
|
||||||
embd.begin() + original_prompt_len,
|
|
||||||
embd.end()
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
input_buf = embd_guidance.data();
|
|
||||||
input_size = embd_guidance.size();
|
|
||||||
|
|
||||||
LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
|
|
||||||
} else {
|
|
||||||
input_buf = embd.data();
|
|
||||||
input_size = embd.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < input_size; i += params.n_batch) {
|
|
||||||
int n_eval = std::min(input_size - i, params.n_batch);
|
|
||||||
if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
|
|
||||||
LOG_TEE("%s : failed to eval\n", __func__);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
n_past_guidance += n_eval;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
|
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
|
||||||
int n_eval = (int) embd.size() - i;
|
int n_eval = (int) embd.size() - i;
|
||||||
if (n_eval > params.n_batch) {
|
if (n_eval > params.n_batch) {
|
||||||
@ -721,7 +679,6 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
embd.clear();
|
embd.clear();
|
||||||
embd_guidance.clear();
|
|
||||||
|
|
||||||
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
||||||
// optionally save the session on first sample (for faster prompt loading next time)
|
// optionally save the session on first sample (for faster prompt loading next time)
|
||||||
@ -732,11 +689,11 @@ int main(int argc, char ** argv) {
|
|||||||
LOG("saved session to %s\n", path_session.c_str());
|
LOG("saved session to %s\n", path_session.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
|
const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
|
||||||
|
|
||||||
llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);
|
gpt_sampler_accept(smpl, id, /* apply_grammar= */ true);
|
||||||
|
|
||||||
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
|
// LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str());
|
||||||
|
|
||||||
embd.push_back(id);
|
embd.push_back(id);
|
||||||
|
|
||||||
@ -755,7 +712,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// push the prompt in the sampling context in order to apply repetition penalties later
|
// push the prompt in the sampling context in order to apply repetition penalties later
|
||||||
// for the prompt, we don't apply grammar rules
|
// for the prompt, we don't apply grammar rules
|
||||||
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false);
|
gpt_sampler_accept(smpl, embd_inp[n_consumed], /* apply_grammar= */ false);
|
||||||
|
|
||||||
++n_consumed;
|
++n_consumed;
|
||||||
if ((int) embd.size() >= params.n_batch) {
|
if ((int) embd.size() >= params.n_batch) {
|
||||||
@ -798,7 +755,7 @@ int main(int argc, char ** argv) {
|
|||||||
// check for reverse prompt in the last n_prev tokens
|
// check for reverse prompt in the last n_prev tokens
|
||||||
if (!params.antiprompt.empty()) {
|
if (!params.antiprompt.empty()) {
|
||||||
const int n_prev = 32;
|
const int n_prev = 32;
|
||||||
const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev);
|
const std::string last_output = gpt_sampler_prev_str(smpl, ctx, n_prev);
|
||||||
|
|
||||||
is_antiprompt = false;
|
is_antiprompt = false;
|
||||||
// Check if each of the reverse prompts appears at the end of the output.
|
// Check if each of the reverse prompts appears at the end of the output.
|
||||||
@ -820,7 +777,7 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// check for reverse prompt using special tokens
|
// check for reverse prompt using special tokens
|
||||||
llama_token last_token = llama_sampling_last(ctx_sampling);
|
llama_token last_token = gpt_sampler_last(smpl);
|
||||||
for (std::vector<llama_token> ids : antiprompt_ids) {
|
for (std::vector<llama_token> ids : antiprompt_ids) {
|
||||||
if (ids.size() == 1 && last_token == ids[0]) {
|
if (ids.size() == 1 && last_token == ids[0]) {
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
@ -837,7 +794,7 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// deal with end of generation tokens in interactive mode
|
// deal with end of generation tokens in interactive mode
|
||||||
if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
|
if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
|
||||||
LOG("found an EOG token\n");
|
LOG("found an EOG token\n");
|
||||||
|
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
@ -858,7 +815,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// if current token is not EOG, we add it to current assistant message
|
// if current token is not EOG, we add it to current assistant message
|
||||||
if (params.conversation) {
|
if (params.conversation) {
|
||||||
auto id = llama_sampling_last(ctx_sampling);
|
const auto id = gpt_sampler_last(smpl);
|
||||||
assistant_ss << llama_token_to_piece(ctx, id, false);
|
assistant_ss << llama_token_to_piece(ctx, id, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -954,7 +911,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
if (n_past > 0) {
|
if (n_past > 0) {
|
||||||
if (is_interacting) {
|
if (is_interacting) {
|
||||||
llama_sampling_reset(ctx_sampling);
|
gpt_sampler_reset(smpl);
|
||||||
}
|
}
|
||||||
is_interacting = false;
|
is_interacting = false;
|
||||||
}
|
}
|
||||||
@ -979,16 +936,20 @@ int main(int argc, char ** argv) {
|
|||||||
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_print_timings(ctx);
|
LOG_TEE("\n");
|
||||||
|
gpt_perf_print(ctx, smpl);
|
||||||
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
|
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
|
||||||
|
|
||||||
if (ctx_guidance) { llama_free(ctx_guidance); }
|
gpt_sampler_free(smpl);
|
||||||
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
|
||||||
llama_sampling_free(ctx_sampling);
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
|
ggml_threadpool_free(threadpool);
|
||||||
|
ggml_threadpool_free(threadpool_batch);
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
#ifndef LOG_DISABLE_LOGS
|
||||||
LOG_TEE("Log end\n");
|
LOG_TEE("Log end\n");
|
||||||
#endif // LOG_DISABLE_LOGS
|
#endif // LOG_DISABLE_LOGS
|
||||||
|
@ -50,8 +50,8 @@ static std::vector<std::string> k_prompts = {
|
|||||||
|
|
||||||
struct client {
|
struct client {
|
||||||
~client() {
|
~client() {
|
||||||
if (ctx_sampling) {
|
if (smpl) {
|
||||||
llama_sampling_free(ctx_sampling);
|
gpt_sampler_free(smpl);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -72,7 +72,7 @@ struct client {
|
|||||||
std::string prompt;
|
std::string prompt;
|
||||||
std::string response;
|
std::string response;
|
||||||
|
|
||||||
struct llama_sampling_context * ctx_sampling = nullptr;
|
struct gpt_sampler * smpl = nullptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
static void print_date_time() {
|
static void print_date_time() {
|
||||||
@ -100,8 +100,8 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||||
gpt_params_print_usage(argc, argv, params);
|
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -161,7 +161,7 @@ int main(int argc, char ** argv) {
|
|||||||
for (size_t i = 0; i < clients.size(); ++i) {
|
for (size_t i = 0; i < clients.size(); ++i) {
|
||||||
auto & client = clients[i];
|
auto & client = clients[i];
|
||||||
client.id = i;
|
client.id = i;
|
||||||
client.ctx_sampling = llama_sampling_init(params.sparams);
|
client.smpl = gpt_sampler_init(model, params.sparams);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<llama_token> tokens_system;
|
std::vector<llama_token> tokens_system;
|
||||||
@ -253,7 +253,7 @@ int main(int argc, char ** argv) {
|
|||||||
client.prompt = client.input + "\nAssistant:";
|
client.prompt = client.input + "\nAssistant:";
|
||||||
client.response = "";
|
client.response = "";
|
||||||
|
|
||||||
llama_sampling_reset(client.ctx_sampling);
|
gpt_sampler_reset(client.smpl);
|
||||||
|
|
||||||
// do not prepend BOS because we have a system prompt!
|
// do not prepend BOS because we have a system prompt!
|
||||||
std::vector<llama_token> tokens_prompt;
|
std::vector<llama_token> tokens_prompt;
|
||||||
@ -341,9 +341,9 @@ int main(int argc, char ** argv) {
|
|||||||
//printf("client %d, seq %d, token %d, pos %d, batch %d\n",
|
//printf("client %d, seq %d, token %d, pos %d, batch %d\n",
|
||||||
// client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
|
// client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
|
||||||
|
|
||||||
const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i);
|
const llama_token id = gpt_sampler_sample(client.smpl, ctx, client.i_batch - i);
|
||||||
|
|
||||||
llama_sampling_accept(client.ctx_sampling, ctx, id, true);
|
gpt_sampler_accept(client.smpl, id, true);
|
||||||
|
|
||||||
if (client.n_decoded == 1) {
|
if (client.n_decoded == 1) {
|
||||||
// start measuring generation time after the first token to make sure all concurrent clients
|
// start measuring generation time after the first token to make sure all concurrent clients
|
||||||
@ -371,7 +371,7 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// delete only the generated part of the sequence, i.e. keep the system prompt in the cache
|
// delete only the generated part of the sequence, i.e. keep the system prompt in the cache
|
||||||
llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1);
|
llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1);
|
||||||
llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1);
|
llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1);
|
||||||
|
|
||||||
const auto t_main_end = ggml_time_us();
|
const auto t_main_end = ggml_time_us();
|
||||||
@ -413,7 +413,8 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
|
|
||||||
llama_print_timings(ctx);
|
// TODO: print sampling/grammar timings for all clients
|
||||||
|
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||||
|
|
||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
|
|
||||||
|
@ -6,9 +6,7 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
static void print_usage(int, char ** argv) {
|
||||||
gpt_params_print_usage(argc, argv, params);
|
|
||||||
|
|
||||||
LOG_TEE("\nexample usage:\n");
|
LOG_TEE("\nexample usage:\n");
|
||||||
LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
|
LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
@ -21,13 +19,11 @@ int main(int argc, char ** argv) {
|
|||||||
params.n_keep = 32;
|
params.n_keep = 32;
|
||||||
params.i_pos = -1;
|
params.i_pos = -1;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PASSKEY, print_usage);
|
||||||
print_usage(argc, argv, params);
|
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
srand(params.seed == LLAMA_DEFAULT_SEED ? time(NULL) : params.seed);
|
|
||||||
|
|
||||||
int n_junk = params.n_junk;
|
int n_junk = params.n_junk;
|
||||||
int n_keep = params.n_keep;
|
int n_keep = params.n_keep;
|
||||||
int n_grp = params.grp_attn_n;
|
int n_grp = params.grp_attn_n;
|
||||||
@ -80,12 +76,17 @@ int main(int argc, char ** argv) {
|
|||||||
GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");
|
GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");
|
||||||
|
|
||||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||||
|
|
||||||
if (ctx == NULL) {
|
if (ctx == NULL) {
|
||||||
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
|
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
auto sparams = llama_sampler_chain_default_params();
|
||||||
|
|
||||||
|
llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
||||||
|
|
||||||
|
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
|
||||||
|
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
std::vector<llama_token> tokens_list;
|
std::vector<llama_token> tokens_list;
|
||||||
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
|
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
|
||||||
@ -217,20 +218,9 @@ int main(int argc, char ** argv) {
|
|||||||
while (n_cur <= n_len) {
|
while (n_cur <= n_len) {
|
||||||
// sample the next token
|
// sample the next token
|
||||||
{
|
{
|
||||||
auto n_vocab = llama_n_vocab(model);
|
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
|
||||||
auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
|
|
||||||
|
|
||||||
std::vector<llama_token_data> candidates;
|
llama_sampler_accept(smpl, new_token_id);
|
||||||
candidates.reserve(n_vocab);
|
|
||||||
|
|
||||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
|
||||||
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
|
||||||
|
|
||||||
// sample the most likely token
|
|
||||||
const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
|
||||||
|
|
||||||
// is it an end of generation?
|
// is it an end of generation?
|
||||||
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
|
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
|
||||||
@ -267,10 +257,13 @@ int main(int argc, char ** argv) {
|
|||||||
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
|
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
|
||||||
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
||||||
|
|
||||||
llama_print_timings(ctx);
|
LOG_TEE("\n");
|
||||||
|
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||||
|
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
|
|
||||||
|
llama_sampler_free(smpl);
|
||||||
|
|
||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
|
@ -76,7 +76,7 @@ static void write_logfile(
|
|||||||
fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
|
fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
|
||||||
yaml_dump_vector_float(logfile, "probs", results.probs);
|
yaml_dump_vector_float(logfile, "probs", results.probs);
|
||||||
|
|
||||||
llama_dump_timing_info_yaml(logfile, ctx);
|
llama_perf_dump_yaml(logfile, ctx);
|
||||||
fclose(logfile);
|
fclose(logfile);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1963,8 +1963,8 @@ int main(int argc, char ** argv) {
|
|||||||
params.n_ctx = 512;
|
params.n_ctx = 512;
|
||||||
params.logits_all = true;
|
params.logits_all = true;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PERPLEXITY);
|
||||||
gpt_params_print_usage(argc, argv, params);
|
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2003,13 +2003,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
print_build_info();
|
print_build_info();
|
||||||
|
|
||||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
|
||||||
params.seed = time(NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
|
|
||||||
|
|
||||||
std::mt19937 rng(params.seed);
|
|
||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
@ -2050,7 +2044,8 @@ int main(int argc, char ** argv) {
|
|||||||
results = perplexity(ctx, params, n_ctx);
|
results = perplexity(ctx, params, n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_print_timings(ctx);
|
LOG_TEE("\n");
|
||||||
|
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||||
write_logfile(ctx, params, model, results);
|
write_logfile(ctx, params, model, results);
|
||||||
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
#define LLAMA_API_INTERNAL
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
#include "llama-impl.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
@ -319,8 +319,7 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
auto cparams = llama_context_default_params();
|
auto cparams = llama_context_default_params();
|
||||||
cparams.n_ctx = 256;
|
cparams.n_ctx = 256;
|
||||||
cparams.seed = 1;
|
|
||||||
|
|
||||||
ctx = llama_new_context_with_model(model, cparams);
|
ctx = llama_new_context_with_model(model, cparams);
|
||||||
|
|
||||||
|
@ -24,6 +24,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
|||||||
{ "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
|
{ "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
|
||||||
{ "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
|
{ "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
|
||||||
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
|
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
|
||||||
|
{ "TQ1_0", LLAMA_FTYPE_MOSTLY_TQ1_0, " 1.69 bpw ternarization", },
|
||||||
|
{ "TQ2_0", LLAMA_FTYPE_MOSTLY_TQ2_0, " 2.06 bpw ternarization", },
|
||||||
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
|
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
|
||||||
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
|
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
|
||||||
{ "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
|
{ "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
|
||||||
@ -107,7 +109,7 @@ static void usage(const char * executable) {
|
|||||||
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
|
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
|
||||||
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
|
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
|
||||||
printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
|
printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
|
||||||
printf(" --keep-split: will generate quatized model in the same shards as input");
|
printf(" --keep-split: will generate quantized model in the same shards as input\n");
|
||||||
printf(" --override-kv KEY=TYPE:VALUE\n");
|
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||||
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
|
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
|
||||||
printf("Note: --include-weights and --exclude-weights cannot be used together\n");
|
printf("Note: --include-weights and --exclude-weights cannot be used together\n");
|
||||||
|
@ -4,9 +4,7 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
|
|
||||||
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
static void print_usage(int, char ** argv) {
|
||||||
gpt_params_print_usage(argc, argv, params);
|
|
||||||
|
|
||||||
LOG_TEE("\nexample usage:\n");
|
LOG_TEE("\nexample usage:\n");
|
||||||
LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
|
LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
@ -113,8 +111,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
|||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_RETRIEVAL, print_usage);
|
||||||
print_usage(argc, argv, params);
|
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -293,9 +291,11 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
LOG_TEE("\n");
|
||||||
|
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||||
|
|
||||||
// clean up
|
// clean up
|
||||||
llama_batch_free(query_batch);
|
llama_batch_free(query_batch);
|
||||||
llama_print_timings(ctx);
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
@ -3,15 +3,15 @@
|
|||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <chrono>
|
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
params.prompt = "The quick brown fox";
|
params.prompt = "The quick brown fox";
|
||||||
|
params.sparams.seed = 1234;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||||
gpt_params_print_usage(argc, argv, params);
|
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -38,6 +38,13 @@ int main(int argc, char ** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
auto sparams = llama_sampler_chain_default_params();
|
||||||
|
|
||||||
|
llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
||||||
|
|
||||||
|
llama_sampler_chain_add(smpl, llama_sampler_init_softmax());
|
||||||
|
llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed));
|
||||||
|
|
||||||
// tokenize prompt
|
// tokenize prompt
|
||||||
auto tokens = llama_tokenize(ctx, params.prompt, true);
|
auto tokens = llama_tokenize(ctx, params.prompt, true);
|
||||||
|
|
||||||
@ -64,18 +71,11 @@ int main(int argc, char ** argv) {
|
|||||||
printf("\nfirst run: %s", params.prompt.c_str());
|
printf("\nfirst run: %s", params.prompt.c_str());
|
||||||
|
|
||||||
for (auto i = 0; i < params.n_predict; i++) {
|
for (auto i = 0; i < params.n_predict; i++) {
|
||||||
auto * logits = llama_get_logits(ctx);
|
auto next_token = llama_sampler_sample(smpl, ctx, -1);
|
||||||
auto n_vocab = llama_n_vocab(model);
|
|
||||||
|
|
||||||
std::vector<llama_token_data> candidates;
|
|
||||||
candidates.reserve(n_vocab);
|
|
||||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
|
||||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
|
||||||
}
|
|
||||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
|
||||||
auto next_token = llama_sample_token(ctx, &candidates_p);
|
|
||||||
auto next_token_str = llama_token_to_piece(ctx, next_token);
|
auto next_token_str = llama_token_to_piece(ctx, next_token);
|
||||||
|
|
||||||
|
llama_sampler_accept(smpl, next_token);
|
||||||
|
|
||||||
printf("%s", next_token_str.c_str());
|
printf("%s", next_token_str.c_str());
|
||||||
result0 += next_token_str;
|
result0 += next_token_str;
|
||||||
|
|
||||||
@ -96,6 +96,11 @@ int main(int argc, char ** argv) {
|
|||||||
// make new context
|
// make new context
|
||||||
auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
|
auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
|
||||||
|
|
||||||
|
llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
|
||||||
|
|
||||||
|
llama_sampler_chain_add(smpl2, llama_sampler_init_softmax());
|
||||||
|
llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed));
|
||||||
|
|
||||||
printf("\nsecond run: %s", params.prompt.c_str());
|
printf("\nsecond run: %s", params.prompt.c_str());
|
||||||
|
|
||||||
// load state (rng, logits, embedding and kv_cache) from file
|
// load state (rng, logits, embedding and kv_cache) from file
|
||||||
@ -124,17 +129,11 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// second run
|
// second run
|
||||||
for (auto i = 0; i < params.n_predict; i++) {
|
for (auto i = 0; i < params.n_predict; i++) {
|
||||||
auto * logits = llama_get_logits(ctx2);
|
auto next_token = llama_sampler_sample(smpl2, ctx2, -1);
|
||||||
auto n_vocab = llama_n_vocab(model);
|
|
||||||
std::vector<llama_token_data> candidates;
|
|
||||||
candidates.reserve(n_vocab);
|
|
||||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
|
||||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
|
||||||
}
|
|
||||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
|
||||||
auto next_token = llama_sample_token(ctx2, &candidates_p);
|
|
||||||
auto next_token_str = llama_token_to_piece(ctx2, next_token);
|
auto next_token_str = llama_token_to_piece(ctx2, next_token);
|
||||||
|
|
||||||
|
llama_sampler_accept(smpl2, next_token);
|
||||||
|
|
||||||
printf("%s", next_token_str.c_str());
|
printf("%s", next_token_str.c_str());
|
||||||
result1 += next_token_str;
|
result1 += next_token_str;
|
||||||
|
|
||||||
@ -157,7 +156,12 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// make new context
|
// make new context
|
||||||
auto* ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
|
auto * ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
|
||||||
|
|
||||||
|
llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
|
||||||
|
|
||||||
|
llama_sampler_chain_add(smpl3, llama_sampler_init_softmax());
|
||||||
|
llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed));
|
||||||
|
|
||||||
printf("\nsingle seq run: %s", params.prompt.c_str());
|
printf("\nsingle seq run: %s", params.prompt.c_str());
|
||||||
|
|
||||||
@ -215,17 +219,11 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// third run with seq 1 instead of 0
|
// third run with seq 1 instead of 0
|
||||||
for (auto i = 0; i < params.n_predict; i++) {
|
for (auto i = 0; i < params.n_predict; i++) {
|
||||||
auto * logits = llama_get_logits(ctx3);
|
auto next_token = llama_sampler_sample(smpl3, ctx3, -1);
|
||||||
auto n_vocab = llama_n_vocab(model);
|
|
||||||
std::vector<llama_token_data> candidates;
|
|
||||||
candidates.reserve(n_vocab);
|
|
||||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
|
||||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
|
||||||
}
|
|
||||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
|
||||||
auto next_token = llama_sample_token(ctx3, &candidates_p);
|
|
||||||
auto next_token_str = llama_token_to_piece(ctx3, next_token);
|
auto next_token_str = llama_token_to_piece(ctx3, next_token);
|
||||||
|
|
||||||
|
llama_sampler_accept(smpl3, next_token);
|
||||||
|
|
||||||
printf("%s", next_token_str.c_str());
|
printf("%s", next_token_str.c_str());
|
||||||
result2 += next_token_str;
|
result2 += next_token_str;
|
||||||
|
|
||||||
@ -240,6 +238,10 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
printf("\n");
|
printf("\n");
|
||||||
|
|
||||||
|
llama_sampler_free(smpl);
|
||||||
|
llama_sampler_free(smpl2);
|
||||||
|
llama_sampler_free(smpl3);
|
||||||
|
|
||||||
llama_free(ctx3);
|
llama_free(ctx3);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
|
||||||
|
@ -17,236 +17,150 @@ The project is under active development, and we are [looking for feedback and co
|
|||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
|
| Argument | Explanation |
|
||||||
|
| -------- | ----------- |
|
||||||
|
| `-h, --help, --usage` | print usage and exit |
|
||||||
|
| `--version` | show version and build info |
|
||||||
|
| `-v, --verbose` | print verbose information |
|
||||||
|
| `--verbosity N` | set specific verbosity level (default: 0) |
|
||||||
|
| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
|
||||||
|
| `--no-display-prompt` | don't print prompt at generation (default: false) |
|
||||||
|
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) |
|
||||||
|
| `-t, --threads N` | number of threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
|
||||||
|
| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
|
||||||
|
| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
|
||||||
|
| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask |
|
||||||
|
| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0)<br/> |
|
||||||
|
| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50)<br/> |
|
||||||
|
| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) |
|
||||||
|
| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch |
|
||||||
|
| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
|
||||||
|
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
|
||||||
|
| `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) |
|
||||||
|
| `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) |
|
||||||
|
| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
|
||||||
|
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
|
||||||
|
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
|
||||||
|
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
|
||||||
|
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
|
||||||
|
| `--chunks N` | max number of chunks to process (default: -1, -1 = all) |
|
||||||
|
| `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
|
||||||
|
| `-p, --prompt PROMPT` | prompt to start generation with |
|
||||||
|
| `-f, --file FNAME` | a file containing the prompt (default: none) |
|
||||||
|
| `--in-file FNAME` | an input file (repeat to specify multiple files) |
|
||||||
|
| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
|
||||||
|
| `-e, --escape` | process escape sequences (\n, \r, \t, \', \", \\) (default: true) |
|
||||||
|
| `--no-escape` | do not process escape sequences |
|
||||||
|
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
|
||||||
|
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typical_p;top_p;min_p;temperature) |
|
||||||
|
| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
|
||||||
|
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
|
||||||
|
| `--penalize-nl` | penalize newline tokens (default: false) |
|
||||||
|
| `--temp N` | temperature (default: 0.8) |
|
||||||
|
| `--top-k N` | top-k sampling (default: 40, 0 = disabled) |
|
||||||
|
| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
|
||||||
|
| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
|
||||||
|
| `--tfs N` | tail free sampling, parameter z (default: 1.0, 1.0 = disabled) |
|
||||||
|
| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) |
|
||||||
|
| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
|
||||||
|
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) |
|
||||||
|
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) |
|
||||||
|
| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) |
|
||||||
|
| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
|
||||||
|
| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
|
||||||
|
| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
|
||||||
|
| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.1) |
|
||||||
|
| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.0) |
|
||||||
|
| `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,<br/>i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',<br/>or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' |
|
||||||
|
| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
|
||||||
|
| `--grammar-file FNAME` | file to read grammar from |
|
||||||
|
| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
|
||||||
|
| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model |
|
||||||
|
| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N |
|
||||||
|
| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model) |
|
||||||
|
| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N |
|
||||||
|
| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size) |
|
||||||
|
| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation) |
|
||||||
|
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0) |
|
||||||
|
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0) |
|
||||||
|
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0) |
|
||||||
|
| `-gan, --grp-attn-n N` | group-attention factor (default: 1) |
|
||||||
|
| `-gaw, --grp-attn-w N` | group-attention width (default: 512.0) |
|
||||||
|
| `-dkvc, --dump-kv-cache` | verbose print of the KV cache |
|
||||||
|
| `-nkvo, --no-kv-offload` | disable KV offload |
|
||||||
|
| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16) |
|
||||||
|
| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) |
|
||||||
|
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
||||||
|
| `-np, --parallel N` | number of parallel sequences to decode (default: 1) |
|
||||||
|
| `-ns, --sequences N` | number of sequences to decode (default: 1) |
|
||||||
|
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
|
||||||
|
| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
|
||||||
|
| `--mlock` | force system to keep model in RAM rather than swapping or compressing |
|
||||||
|
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) |
|
||||||
|
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437 |
|
||||||
|
| `-ngl, --gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
|
||||||
|
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs |
|
||||||
|
| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 |
|
||||||
|
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) |
|
||||||
|
| `--check-tensors` | check model tensor data for invalid values (default: false) |
|
||||||
|
| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false |
|
||||||
|
| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) |
|
||||||
|
| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) |
|
||||||
|
| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
|
||||||
|
| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE<br/>note: this argument can be repeated to add multiple scaled control vectors |
|
||||||
|
| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
|
||||||
|
| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_MODEL) |
|
||||||
|
| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
|
||||||
|
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
|
||||||
|
| `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
|
||||||
|
| `-hff, --hf-file FILE` | Hugging Face model file (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
|
||||||
|
| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
|
||||||
|
| `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
|
||||||
|
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
|
||||||
|
| `--path PATH` | path to serve static files from (default: ) |
|
||||||
|
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
|
||||||
|
| `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
|
||||||
|
| `--api-key-file FNAME` | path to file containing API keys (default: none) |
|
||||||
|
| `--ssl-key-file FNAME` | path to a file containing a PEM-encoded SSL private key |
|
||||||
|
| `--ssl-cert-file FNAME` | path to a file containing a PEM-encoded SSL certificate |
|
||||||
|
| `--timeout N` | server read/write timeout in seconds (default: 600) |
|
||||||
|
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
|
||||||
|
| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
|
||||||
|
| `--log-format {text, json}` | log output format: json or text (default: json) |
|
||||||
|
| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
|
||||||
|
| `--no-slots` | disables slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
|
||||||
|
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
|
||||||
|
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted:<br/>https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
|
||||||
|
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
|
||||||
|
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
|
||||||
|
| `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) |
|
||||||
|
| `--log-test` | Log test |
|
||||||
|
| `--log-disable` | Log disable |
|
||||||
|
| `--log-enable` | Log enable |
|
||||||
|
| `--log-new` | Log new |
|
||||||
|
| `--log-append` | Log append |
|
||||||
|
| `--log-file FNAME` | Log file |
|
||||||
|
|
||||||
|
Note: If both a command line argument and an environment variable are set for the same param, the argument will take precedence over the env var.
|
||||||
|
|
||||||
|
Example usage of docker compose with environment variables:
|
||||||
|
|
||||||
|
```yml
|
||||||
|
services:
|
||||||
|
llamacpp-server:
|
||||||
|
image: ghcr.io/ggerganov/llama.cpp:server
|
||||||
|
ports:
|
||||||
|
- 8080:8080
|
||||||
|
volumes:
|
||||||
|
- ./models:/models
|
||||||
|
environment:
|
||||||
|
# alternatively, you can use "LLAMA_ARG_MODEL_URL" to download the model
|
||||||
|
LLAMA_ARG_MODEL: /models/my_model.gguf
|
||||||
|
LLAMA_ARG_CTX_SIZE: 4096
|
||||||
|
LLAMA_ARG_N_PARALLEL: 2
|
||||||
|
LLAMA_ARG_ENDPOINT_METRICS: 1
|
||||||
|
LLAMA_ARG_PORT: 8080
|
||||||
```
|
```
|
||||||
usage: ./llama-server [options]
|
|
||||||
|
|
||||||
general:
|
|
||||||
|
|
||||||
-h, --help, --usage print usage and exit
|
|
||||||
--version show version and build info
|
|
||||||
-v, --verbose print verbose information
|
|
||||||
--verbosity N set specific verbosity level (default: 0)
|
|
||||||
--verbose-prompt print a verbose prompt before generation (default: false)
|
|
||||||
--no-display-prompt don't print prompt at generation (default: false)
|
|
||||||
-co, --color colorise output to distinguish prompt and user input from generations (default: false)
|
|
||||||
-s, --seed SEED RNG seed (default: -1, use random seed for < 0)
|
|
||||||
-t, --threads N number of threads to use during generation (default: 8)
|
|
||||||
-tb, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)
|
|
||||||
-td, --threads-draft N number of threads to use during generation (default: same as --threads)
|
|
||||||
-tbd, --threads-batch-draft N number of threads to use during batch and prompt processing (default: same as --threads-draft)
|
|
||||||
--draft N number of tokens to draft for speculative decoding (default: 5)
|
|
||||||
-ps, --p-split N speculative decoding split probability (default: 0.1)
|
|
||||||
-lcs, --lookup-cache-static FNAME
|
|
||||||
path to static lookup cache to use for lookup decoding (not updated by generation)
|
|
||||||
-lcd, --lookup-cache-dynamic FNAME
|
|
||||||
path to dynamic lookup cache to use for lookup decoding (updated by generation)
|
|
||||||
-c, --ctx-size N size of the prompt context (default: 0, 0 = loaded from model)
|
|
||||||
-n, --predict N number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)
|
|
||||||
-b, --batch-size N logical maximum batch size (default: 2048)
|
|
||||||
-ub, --ubatch-size N physical maximum batch size (default: 512)
|
|
||||||
--keep N number of tokens to keep from the initial prompt (default: 0, -1 = all)
|
|
||||||
--chunks N max number of chunks to process (default: -1, -1 = all)
|
|
||||||
-fa, --flash-attn enable Flash Attention (default: disabled)
|
|
||||||
-p, --prompt PROMPT prompt to start generation with
|
|
||||||
in conversation mode, this will be used as system prompt
|
|
||||||
(default: '')
|
|
||||||
-f, --file FNAME a file containing the prompt (default: none)
|
|
||||||
--in-file FNAME an input file (repeat to specify multiple files)
|
|
||||||
-bf, --binary-file FNAME binary file containing the prompt (default: none)
|
|
||||||
-e, --escape process escapes sequences (\n, \r, \t, \', \", \\) (default: true)
|
|
||||||
--no-escape do not process escape sequences
|
|
||||||
-ptc, --print-token-count N print token count every N tokens (default: -1)
|
|
||||||
--prompt-cache FNAME file to cache prompt state for faster startup (default: none)
|
|
||||||
--prompt-cache-all if specified, saves user input and generations to cache as well
|
|
||||||
not supported with --interactive or other interactive options
|
|
||||||
--prompt-cache-ro if specified, uses the prompt cache but does not update it
|
|
||||||
-r, --reverse-prompt PROMPT halt generation at PROMPT, return control in interactive mode
|
|
||||||
can be specified more than once for multiple prompts
|
|
||||||
-sp, --special special tokens output enabled (default: false)
|
|
||||||
-cnv, --conversation run in conversation mode, does not print special tokens and suffix/prefix
|
|
||||||
if suffix/prefix are not specified, default chat template will be used
|
|
||||||
(default: false)
|
|
||||||
-i, --interactive run in interactive mode (default: false)
|
|
||||||
-if, --interactive-first run in interactive mode and wait for input right away (default: false)
|
|
||||||
-mli, --multiline-input allows you to write or paste multiple lines without ending each in '\'
|
|
||||||
--in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string
|
|
||||||
--in-prefix STRING string to prefix user inputs with (default: empty)
|
|
||||||
--in-suffix STRING string to suffix after user inputs with (default: empty)
|
|
||||||
--spm-infill use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled)
|
|
||||||
|
|
||||||
sampling:
|
|
||||||
|
|
||||||
--samplers SAMPLERS samplers that will be used for generation in the order, separated by ';'
|
|
||||||
(default: top_k;tfs_z;typical_p;top_p;min_p;temperature)
|
|
||||||
--sampling-seq SEQUENCE simplified sequence for samplers that will be used (default: kfypmt)
|
|
||||||
--ignore-eos ignore end of stream token and continue generating (implies --logit-bias EOS-inf)
|
|
||||||
--penalize-nl penalize newline tokens (default: false)
|
|
||||||
--temp N temperature (default: 0.8)
|
|
||||||
--top-k N top-k sampling (default: 40, 0 = disabled)
|
|
||||||
--top-p N top-p sampling (default: 0.9, 1.0 = disabled)
|
|
||||||
--min-p N min-p sampling (default: 0.1, 0.0 = disabled)
|
|
||||||
--tfs N tail free sampling, parameter z (default: 1.0, 1.0 = disabled)
|
|
||||||
--typical N locally typical sampling, parameter p (default: 1.0, 1.0 = disabled)
|
|
||||||
--repeat-last-n N last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size)
|
|
||||||
--repeat-penalty N penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled)
|
|
||||||
--presence-penalty N repeat alpha presence penalty (default: 0.0, 0.0 = disabled)
|
|
||||||
--frequency-penalty N repeat alpha frequency penalty (default: 0.0, 0.0 = disabled)
|
|
||||||
--dynatemp-range N dynamic temperature range (default: 0.0, 0.0 = disabled)
|
|
||||||
--dynatemp-exp N dynamic temperature exponent (default: 1.0)
|
|
||||||
--mirostat N use Mirostat sampling.
|
|
||||||
Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.
|
|
||||||
(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)
|
|
||||||
--mirostat-lr N Mirostat learning rate, parameter eta (default: 0.1)
|
|
||||||
--mirostat-ent N Mirostat target entropy, parameter tau (default: 5.0)
|
|
||||||
-l TOKEN_ID(+/-)BIAS modifies the likelihood of token appearing in the completion,
|
|
||||||
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
|
|
||||||
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'
|
|
||||||
--cfg-negative-prompt PROMPT
|
|
||||||
negative prompt to use for guidance (default: '')
|
|
||||||
--cfg-negative-prompt-file FNAME
|
|
||||||
negative prompt file to use for guidance
|
|
||||||
--cfg-scale N strength of guidance (default: 1.0, 1.0 = disable)
|
|
||||||
--chat-template JINJA_TEMPLATE
|
|
||||||
set custom jinja chat template (default: template taken from model's metadata)
|
|
||||||
if suffix/prefix are specified, template will be disabled
|
|
||||||
only commonly used templates are accepted:
|
|
||||||
https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
|
|
||||||
|
|
||||||
grammar:
|
|
||||||
|
|
||||||
--grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '')
|
|
||||||
--grammar-file FNAME file to read grammar from
|
|
||||||
-j, --json-schema SCHEMA JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
|
|
||||||
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead
|
|
||||||
|
|
||||||
embedding:
|
|
||||||
|
|
||||||
--pooling {none,mean,cls,last}
|
|
||||||
pooling type for embeddings, use model default if unspecified
|
|
||||||
--attention {causal,non-causal}
|
|
||||||
attention type for embeddings, use model default if unspecified
|
|
||||||
|
|
||||||
context hacking:
|
|
||||||
|
|
||||||
--rope-scaling {none,linear,yarn}
|
|
||||||
RoPE frequency scaling method, defaults to linear unless specified by the model
|
|
||||||
--rope-scale N RoPE context scaling factor, expands context by a factor of N
|
|
||||||
--rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
|
|
||||||
--rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N
|
|
||||||
--yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)
|
|
||||||
--yarn-ext-factor N YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)
|
|
||||||
--yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
|
|
||||||
--yarn-beta-slow N YaRN: high correction dim or alpha (default: 1.0)
|
|
||||||
--yarn-beta-fast N YaRN: low correction dim or beta (default: 32.0)
|
|
||||||
-gan, --grp-attn-n N group-attention factor (default: 1)
|
|
||||||
-gaw, --grp-attn-w N group-attention width (default: 512.0)
|
|
||||||
-dkvc, --dump-kv-cache verbose print of the KV cache
|
|
||||||
-nkvo, --no-kv-offload disable KV offload
|
|
||||||
-ctk, --cache-type-k TYPE KV cache data type for K (default: f16)
|
|
||||||
-ctv, --cache-type-v TYPE KV cache data type for V (default: f16)
|
|
||||||
|
|
||||||
perplexity:
|
|
||||||
|
|
||||||
--all-logits return logits for all tokens in the batch (default: false)
|
|
||||||
--hellaswag compute HellaSwag score over random tasks from datafile supplied with -f
|
|
||||||
--hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: 400)
|
|
||||||
--winogrande compute Winogrande score over random tasks from datafile supplied with -f
|
|
||||||
--winogrande-tasks N number of tasks to use when computing the Winogrande score (default: 0)
|
|
||||||
--multiple-choice compute multiple choice score over random tasks from datafile supplied with -f
|
|
||||||
--multiple-choice-tasks N
|
|
||||||
number of tasks to use when computing the multiple choice score (default: 0)
|
|
||||||
--kl-divergence computes KL-divergence to logits provided via --kl-divergence-base
|
|
||||||
--ppl-stride N stride for perplexity calculation (default: 0)
|
|
||||||
--ppl-output-type {0,1} output type for perplexity calculation (default: 0)
|
|
||||||
|
|
||||||
parallel:
|
|
||||||
|
|
||||||
-dt, --defrag-thold N KV cache defragmentation threshold (default: -1.0, < 0 - disabled)
|
|
||||||
-np, --parallel N number of parallel sequences to decode (default: 1)
|
|
||||||
-ns, --sequences N number of sequences to decode (default: 1)
|
|
||||||
-cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled)
|
|
||||||
|
|
||||||
multi-modality:
|
|
||||||
|
|
||||||
--mmproj FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md
|
|
||||||
--image FILE path to an image file. use with multimodal models. Specify multiple times for batching
|
|
||||||
|
|
||||||
backend:
|
|
||||||
|
|
||||||
--rpc SERVERS comma separated list of RPC servers
|
|
||||||
--mlock force system to keep model in RAM rather than swapping or compressing
|
|
||||||
--no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)
|
|
||||||
--numa TYPE attempt optimizations that help on some NUMA systems
|
|
||||||
- distribute: spread execution evenly over all nodes
|
|
||||||
- isolate: only spawn threads on CPUs on the node that execution started on
|
|
||||||
- numactl: use the CPU map provided by numactl
|
|
||||||
if run without this previously, it is recommended to drop the system page cache before using this
|
|
||||||
see https://github.com/ggerganov/llama.cpp/issues/1437
|
|
||||||
|
|
||||||
model:
|
|
||||||
|
|
||||||
--check-tensors check model tensor data for invalid values (default: false)
|
|
||||||
--override-kv KEY=TYPE:VALUE
|
|
||||||
advanced option to override model metadata by key. may be specified multiple times.
|
|
||||||
types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false
|
|
||||||
--lora FNAME apply LoRA adapter (implies --no-mmap)
|
|
||||||
--lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)
|
|
||||||
--lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter
|
|
||||||
--control-vector FNAME add a control vector
|
|
||||||
note: this argument can be repeated to add multiple control vectors
|
|
||||||
--control-vector-scaled FNAME SCALE
|
|
||||||
add a control vector with user defined scaling SCALE
|
|
||||||
note: this argument can be repeated to add multiple scaled control vectors
|
|
||||||
--control-vector-layer-range START END
|
|
||||||
layer range to apply the control vector(s) to, start and end inclusive
|
|
||||||
-m, --model FNAME model path (default: models/$filename with filename from --hf-file
|
|
||||||
or --model-url if set, otherwise models/7B/ggml-model-f16.gguf)
|
|
||||||
-md, --model-draft FNAME draft model for speculative decoding (default: unused)
|
|
||||||
-mu, --model-url MODEL_URL model download url (default: unused)
|
|
||||||
-hfr, --hf-repo REPO Hugging Face model repository (default: unused)
|
|
||||||
-hff, --hf-file FILE Hugging Face model file (default: unused)
|
|
||||||
-hft, --hf-token TOKEN Hugging Face access token (default: value from HF_TOKEN environment variable)
|
|
||||||
|
|
||||||
server:
|
|
||||||
|
|
||||||
--host HOST ip address to listen (default: 127.0.0.1)
|
|
||||||
--port PORT port to listen (default: 8080)
|
|
||||||
--path PATH path to serve static files from (default: )
|
|
||||||
--embedding(s) restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
|
|
||||||
--api-key KEY API key to use for authentication (default: none)
|
|
||||||
--api-key-file FNAME path to file containing API keys (default: none)
|
|
||||||
--ssl-key-file FNAME path to file a PEM-encoded SSL private key
|
|
||||||
--ssl-cert-file FNAME path to file a PEM-encoded SSL certificate
|
|
||||||
--timeout N server read/write timeout in seconds (default: 600)
|
|
||||||
--threads-http N number of threads used to process HTTP requests (default: -1)
|
|
||||||
--system-prompt-file FNAME
|
|
||||||
set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications
|
|
||||||
--log-format {text,json}
|
|
||||||
log output format: json or text (default: json)
|
|
||||||
--metrics enable prometheus compatible metrics endpoint (default: disabled)
|
|
||||||
--no-slots disables slots monitoring endpoint (default: enabled)
|
|
||||||
--slot-save-path PATH path to save slot kv cache (default: disabled)
|
|
||||||
--chat-template JINJA_TEMPLATE
|
|
||||||
set custom jinja chat template (default: template taken from model's metadata)
|
|
||||||
only commonly used templates are accepted:
|
|
||||||
https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
|
|
||||||
-sps, --slot-prompt-similarity SIMILARITY
|
|
||||||
how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
|
|
||||||
--lora-init-without-apply
|
|
||||||
load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled)
|
|
||||||
|
|
||||||
logging:
|
|
||||||
|
|
||||||
--simple-io use basic IO for better compatibility in subprocesses and limited consoles
|
|
||||||
-ld, --logdir LOGDIR path under which to save YAML logs (no logging if unset)
|
|
||||||
--log-test Run simple logging test
|
|
||||||
--log-disable Disable trace logs
|
|
||||||
--log-enable Enable trace logs
|
|
||||||
--log-file FNAME Specify a log filename (without extension)
|
|
||||||
--log-new Create a separate new log file on start. Each log file will have unique name: "<name>.<ID>.log"
|
|
||||||
--log-append Don't truncate the old log file.
|
|
||||||
```
|
|
||||||
|
|
||||||
|
|
||||||
## Build
|
## Build
|
||||||
|
|
||||||
@ -425,8 +339,6 @@ node index.js
|
|||||||
|
|
||||||
`frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
|
`frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
|
||||||
|
|
||||||
`penalty_prompt`: This will replace the `prompt` for the purpose of the penalty evaluation. Can be either `null`, a string or an array of numbers representing tokens. Default: `null`, which is to use the original `prompt`.
|
|
||||||
|
|
||||||
`mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
|
`mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
|
||||||
|
|
||||||
`mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`
|
`mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`
|
||||||
@ -679,7 +591,6 @@ Example:
|
|||||||
"stopping_word": ""
|
"stopping_word": ""
|
||||||
},
|
},
|
||||||
"penalize_nl": true,
|
"penalize_nl": true,
|
||||||
"penalty_prompt_tokens": [],
|
|
||||||
"presence_penalty": 0.0,
|
"presence_penalty": 0.0,
|
||||||
"prompt": "Say hello to llama.cpp",
|
"prompt": "Say hello to llama.cpp",
|
||||||
"repeat_last_n": 64,
|
"repeat_last_n": 64,
|
||||||
@ -703,8 +614,7 @@ Example:
|
|||||||
"tfs_z": 1.0,
|
"tfs_z": 1.0,
|
||||||
"top_k": 40,
|
"top_k": 40,
|
||||||
"top_p": 0.949999988079071,
|
"top_p": 0.949999988079071,
|
||||||
"typical_p": 1.0,
|
"typical_p": 1.0
|
||||||
"use_penalty_prompt_tokens": false
|
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
```
|
```
|
||||||
|
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
@ -9,8 +9,11 @@ Feature: llama.cpp server
|
|||||||
And a model alias bert-bge-small
|
And a model alias bert-bge-small
|
||||||
And 42 as server seed
|
And 42 as server seed
|
||||||
And 2 slots
|
And 2 slots
|
||||||
And 1024 as batch size
|
# the bert-bge-small model has context size of 512
|
||||||
And 1024 as ubatch size
|
# since the generated prompts are as big as the batch size, we need to set the batch size to 512
|
||||||
|
# ref: https://huggingface.co/BAAI/bge-small-en-v1.5/blob/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/config.json#L20
|
||||||
|
And 512 as batch size
|
||||||
|
And 512 as ubatch size
|
||||||
And 2048 KV cache size
|
And 2048 KV cache size
|
||||||
And embeddings extraction
|
And embeddings extraction
|
||||||
Then the server is starting
|
Then the server is starting
|
||||||
|
@ -77,6 +77,35 @@ Feature: Parallel
|
|||||||
| disabled | 128 |
|
| disabled | 128 |
|
||||||
| enabled | 64 |
|
| enabled | 64 |
|
||||||
|
|
||||||
|
Scenario Outline: Multi users with number of prompts exceeding number of slots
|
||||||
|
Given a system prompt You are a writer.
|
||||||
|
And a model tinyllama-2
|
||||||
|
Given a prompt:
|
||||||
|
"""
|
||||||
|
Write a very long book.
|
||||||
|
"""
|
||||||
|
And a prompt:
|
||||||
|
"""
|
||||||
|
Write another a poem.
|
||||||
|
"""
|
||||||
|
And a prompt:
|
||||||
|
"""
|
||||||
|
What is LLM?
|
||||||
|
"""
|
||||||
|
And a prompt:
|
||||||
|
"""
|
||||||
|
The sky is blue and I love it.
|
||||||
|
"""
|
||||||
|
And <n_predict> max tokens to predict
|
||||||
|
And streaming is <streaming>
|
||||||
|
Given concurrent OAI completions requests
|
||||||
|
Then the server is busy
|
||||||
|
Then the server is idle
|
||||||
|
Then all prompts are predicted with <n_predict> tokens
|
||||||
|
Examples:
|
||||||
|
| streaming | n_predict |
|
||||||
|
| disabled | 128 |
|
||||||
|
| enabled | 64 |
|
||||||
|
|
||||||
Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969
|
Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969
|
||||||
Given a prompt:
|
Given a prompt:
|
||||||
|
@ -15,6 +15,7 @@ Feature: Passkey / Self-extend with context shift
|
|||||||
And <n_junk> as number of junk
|
And <n_junk> as number of junk
|
||||||
And <n_predicted> server max tokens to predict
|
And <n_predicted> server max tokens to predict
|
||||||
And 42 as seed
|
And 42 as seed
|
||||||
|
And 0.0 temperature
|
||||||
And <n_ctx> KV cache size
|
And <n_ctx> KV cache size
|
||||||
And 1 slots
|
And 1 slots
|
||||||
And <n_ga> group attention factor to extend context size through self-extend
|
And <n_ga> group attention factor to extend context size through self-extend
|
||||||
@ -22,7 +23,8 @@ Feature: Passkey / Self-extend with context shift
|
|||||||
# Can be override with N_GPU_LAYERS
|
# Can be override with N_GPU_LAYERS
|
||||||
And <ngl> GPU offloaded layers
|
And <ngl> GPU offloaded layers
|
||||||
Then the server is starting
|
Then the server is starting
|
||||||
Then the server is healthy
|
# Higher timeout because the model may need to be downloaded from the internet
|
||||||
|
Then the server is healthy with timeout 120 seconds
|
||||||
Given available models
|
Given available models
|
||||||
Then model 0 is trained on <n_ctx_train> tokens context
|
Then model 0 is trained on <n_ctx_train> tokens context
|
||||||
Given a prefix prompt:
|
Given a prefix prompt:
|
||||||
|
@ -23,6 +23,8 @@ from prometheus_client import parser
|
|||||||
|
|
||||||
# pyright: reportRedeclaration=false
|
# pyright: reportRedeclaration=false
|
||||||
|
|
||||||
|
DEFAULT_TIMEOUT_SECONDS = aiohttp.ClientTimeout(total=600)
|
||||||
|
|
||||||
@step("a server listening on {server_fqdn}:{server_port}")
|
@step("a server listening on {server_fqdn}:{server_port}")
|
||||||
def step_server_config(context, server_fqdn: str, server_port: str):
|
def step_server_config(context, server_fqdn: str, server_port: str):
|
||||||
context.server_fqdn = server_fqdn
|
context.server_fqdn = server_fqdn
|
||||||
@ -200,17 +202,15 @@ def step_start_server(context):
|
|||||||
time.sleep(0.1)
|
time.sleep(0.1)
|
||||||
|
|
||||||
|
|
||||||
@step("the server is {expecting_status}")
|
async def wait_for_server_status_with_timeout(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str, timeout: int):
|
||||||
@async_run_until_complete
|
|
||||||
async def step_wait_for_the_server_to_be_started(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str):
|
|
||||||
match expecting_status:
|
match expecting_status:
|
||||||
case 'healthy':
|
case 'healthy':
|
||||||
await wait_for_slots_status(context, context.base_url, 200,
|
await wait_for_slots_status(context, context.base_url, 200,
|
||||||
timeout=30)
|
timeout=timeout)
|
||||||
|
|
||||||
case 'ready' | 'idle':
|
case 'ready' | 'idle':
|
||||||
await wait_for_slots_status(context, context.base_url, 200,
|
await wait_for_slots_status(context, context.base_url, 200,
|
||||||
timeout=30,
|
timeout=timeout,
|
||||||
params={'fail_on_no_slot': 1},
|
params={'fail_on_no_slot': 1},
|
||||||
slots_idle=context.n_slots,
|
slots_idle=context.n_slots,
|
||||||
slots_processing=0)
|
slots_processing=0)
|
||||||
@ -223,6 +223,18 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status: Lite
|
|||||||
assert False, "unknown status"
|
assert False, "unknown status"
|
||||||
|
|
||||||
|
|
||||||
|
@step("the server is {expecting_status} with timeout {timeout:d} seconds")
|
||||||
|
@async_run_until_complete
|
||||||
|
async def step_wait_for_server_status_with_timeout(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str, timeout: int):
|
||||||
|
await wait_for_server_status_with_timeout(context, expecting_status, timeout)
|
||||||
|
|
||||||
|
|
||||||
|
@step("the server is {expecting_status}")
|
||||||
|
@async_run_until_complete
|
||||||
|
async def step_wait_for_server_status(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str):
|
||||||
|
await wait_for_server_status_with_timeout(context, expecting_status, 30)
|
||||||
|
|
||||||
|
|
||||||
@step('all slots are {expected_slot_status_string}')
|
@step('all slots are {expected_slot_status_string}')
|
||||||
@async_run_until_complete
|
@async_run_until_complete
|
||||||
async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
|
async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
|
||||||
@ -689,7 +701,7 @@ def step_tokenize_set_add_special(context):
|
|||||||
@async_run_until_complete
|
@async_run_until_complete
|
||||||
async def step_tokenize(context):
|
async def step_tokenize(context):
|
||||||
context.tokenized_text = context_text(context)
|
context.tokenized_text = context_text(context)
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
|
||||||
tokenize_args = {
|
tokenize_args = {
|
||||||
"content": context.tokenized_text,
|
"content": context.tokenized_text,
|
||||||
}
|
}
|
||||||
@ -706,7 +718,7 @@ async def step_tokenize(context):
|
|||||||
@async_run_until_complete
|
@async_run_until_complete
|
||||||
async def step_detokenize(context):
|
async def step_detokenize(context):
|
||||||
assert len(context.tokens) > 0
|
assert len(context.tokens) > 0
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
|
||||||
async with session.post(f'{context.base_url}/detokenize',
|
async with session.post(f'{context.base_url}/detokenize',
|
||||||
json={
|
json={
|
||||||
"tokens": context.tokens,
|
"tokens": context.tokens,
|
||||||
@ -735,7 +747,7 @@ def step_strings_for_tokenization(context):
|
|||||||
@step('an OPTIONS request is sent from {origin}')
|
@step('an OPTIONS request is sent from {origin}')
|
||||||
@async_run_until_complete
|
@async_run_until_complete
|
||||||
async def step_options_request(context, origin):
|
async def step_options_request(context, origin):
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
|
||||||
headers = {'Authorization': f'Bearer {context.user_api_key}', 'Origin': origin}
|
headers = {'Authorization': f'Bearer {context.user_api_key}', 'Origin': origin}
|
||||||
async with session.options(f'{context.base_url}/v1/chat/completions',
|
async with session.options(f'{context.base_url}/v1/chat/completions',
|
||||||
headers=headers) as response:
|
headers=headers) as response:
|
||||||
@ -751,7 +763,7 @@ def step_check_options_header_value(context, cors_header, cors_header_value):
|
|||||||
@step('prometheus metrics are exposed')
|
@step('prometheus metrics are exposed')
|
||||||
@async_run_until_complete
|
@async_run_until_complete
|
||||||
async def step_prometheus_metrics_exported(context):
|
async def step_prometheus_metrics_exported(context):
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
|
||||||
async with await session.get(f'{context.base_url}/metrics') as metrics_response:
|
async with await session.get(f'{context.base_url}/metrics') as metrics_response:
|
||||||
assert metrics_response.status == 200
|
assert metrics_response.status == 200
|
||||||
assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4"
|
assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4"
|
||||||
@ -818,13 +830,13 @@ async def concurrent_requests(context, f_completion, *args, **kwargs):
|
|||||||
for prompt_no in range(context.n_prompts):
|
for prompt_no in range(context.n_prompts):
|
||||||
shifted_args = [context.prompts.pop(), seeds[prompt_no], *args]
|
shifted_args = [context.prompts.pop(), seeds[prompt_no], *args]
|
||||||
context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
|
context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
|
||||||
await asyncio.sleep(0.1)
|
await asyncio.sleep(0.01)
|
||||||
|
|
||||||
|
|
||||||
@step('the slot {slot_id:d} is saved with filename "{filename}"')
|
@step('the slot {slot_id:d} is saved with filename "{filename}"')
|
||||||
@async_run_until_complete
|
@async_run_until_complete
|
||||||
async def step_save_slot(context, slot_id, filename):
|
async def step_save_slot(context, slot_id, filename):
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
|
||||||
async with session.post(f'{context.base_url}/slots/{slot_id}?action=save',
|
async with session.post(f'{context.base_url}/slots/{slot_id}?action=save',
|
||||||
json={"filename": filename},
|
json={"filename": filename},
|
||||||
headers={"Content-Type": "application/json"}) as response:
|
headers={"Content-Type": "application/json"}) as response:
|
||||||
@ -834,7 +846,7 @@ async def step_save_slot(context, slot_id, filename):
|
|||||||
@step('the slot {slot_id:d} is restored with filename "{filename}"')
|
@step('the slot {slot_id:d} is restored with filename "{filename}"')
|
||||||
@async_run_until_complete
|
@async_run_until_complete
|
||||||
async def step_restore_slot(context, slot_id, filename):
|
async def step_restore_slot(context, slot_id, filename):
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
|
||||||
async with session.post(f'{context.base_url}/slots/{slot_id}?action=restore',
|
async with session.post(f'{context.base_url}/slots/{slot_id}?action=restore',
|
||||||
json={"filename": filename},
|
json={"filename": filename},
|
||||||
headers={"Content-Type": "application/json"}) as response:
|
headers={"Content-Type": "application/json"}) as response:
|
||||||
@ -844,7 +856,7 @@ async def step_restore_slot(context, slot_id, filename):
|
|||||||
@step('the slot {slot_id:d} is erased')
|
@step('the slot {slot_id:d} is erased')
|
||||||
@async_run_until_complete
|
@async_run_until_complete
|
||||||
async def step_erase_slot(context, slot_id):
|
async def step_erase_slot(context, slot_id):
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
|
||||||
async with session.post(f'{context.base_url}/slots/{slot_id}?action=erase',
|
async with session.post(f'{context.base_url}/slots/{slot_id}?action=erase',
|
||||||
headers={"Content-Type": "application/json"}) as response:
|
headers={"Content-Type": "application/json"}) as response:
|
||||||
context.response = response
|
context.response = response
|
||||||
@ -853,7 +865,7 @@ async def step_erase_slot(context, slot_id):
|
|||||||
@step('switch {on_or_off} lora adapter {lora_id:d}')
|
@step('switch {on_or_off} lora adapter {lora_id:d}')
|
||||||
@async_run_until_complete
|
@async_run_until_complete
|
||||||
async def toggle_lora_adapter(context, on_or_off: str, lora_id: int):
|
async def toggle_lora_adapter(context, on_or_off: str, lora_id: int):
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
|
||||||
async with session.post(f'{context.base_url}/lora-adapters',
|
async with session.post(f'{context.base_url}/lora-adapters',
|
||||||
json=[{'id': lora_id, 'scale': 1 if on_or_off == 'on' else 0}],
|
json=[{'id': lora_id, 'scale': 1 if on_or_off == 'on' else 0}],
|
||||||
headers={"Content-Type": "application/json"}) as response:
|
headers={"Content-Type": "application/json"}) as response:
|
||||||
@ -889,7 +901,7 @@ async def request_completion(prompt,
|
|||||||
print(f"Set user_api_key: {user_api_key}")
|
print(f"Set user_api_key: {user_api_key}")
|
||||||
headers['Authorization'] = f'Bearer {user_api_key}'
|
headers['Authorization'] = f'Bearer {user_api_key}'
|
||||||
|
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
|
||||||
async with session.post(f'{base_url}/completion',
|
async with session.post(f'{base_url}/completion',
|
||||||
json={
|
json={
|
||||||
"input_prefix": prompt_prefix,
|
"input_prefix": prompt_prefix,
|
||||||
@ -902,8 +914,7 @@ async def request_completion(prompt,
|
|||||||
"temperature": temperature if temperature is not None else 0.8,
|
"temperature": temperature if temperature is not None else 0.8,
|
||||||
"n_probs": 2,
|
"n_probs": 2,
|
||||||
},
|
},
|
||||||
headers=headers,
|
headers=headers) as response:
|
||||||
timeout=3600) as response:
|
|
||||||
if expect_api_error is None or not expect_api_error:
|
if expect_api_error is None or not expect_api_error:
|
||||||
assert response.status == 200
|
assert response.status == 200
|
||||||
assert response.headers['Access-Control-Allow-Origin'] == origin
|
assert response.headers['Access-Control-Allow-Origin'] == origin
|
||||||
@ -961,7 +972,7 @@ async def oai_chat_completions(user_prompt,
|
|||||||
if async_client:
|
if async_client:
|
||||||
origin = 'llama.cpp'
|
origin = 'llama.cpp'
|
||||||
headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
|
headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
|
||||||
async with session.post(f'{base_url}{base_path}',
|
async with session.post(f'{base_url}{base_path}',
|
||||||
json=payload,
|
json=payload,
|
||||||
headers=headers) as response:
|
headers=headers) as response:
|
||||||
@ -1048,7 +1059,7 @@ async def oai_chat_completions(user_prompt,
|
|||||||
|
|
||||||
|
|
||||||
async def request_embedding(content, seed, base_url=None) -> list[list[float]]:
|
async def request_embedding(content, seed, base_url=None) -> list[list[float]]:
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
|
||||||
async with session.post(f'{base_url}/embedding',
|
async with session.post(f'{base_url}/embedding',
|
||||||
json={
|
json={
|
||||||
"content": content,
|
"content": content,
|
||||||
@ -1068,14 +1079,13 @@ async def request_oai_embeddings(input, seed,
|
|||||||
headers=[]
|
headers=[]
|
||||||
if user_api_key is not None:
|
if user_api_key is not None:
|
||||||
headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
|
headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
|
||||||
async with session.post(f'{base_url}/v1/embeddings',
|
async with session.post(f'{base_url}/v1/embeddings',
|
||||||
json={
|
json={
|
||||||
"input": input,
|
"input": input,
|
||||||
"model": model,
|
"model": model,
|
||||||
},
|
},
|
||||||
headers=headers,
|
headers=headers) as response:
|
||||||
timeout=3600) as response:
|
|
||||||
assert response.status == 200, f"received status code not expected: {response.status}"
|
assert response.status == 200, f"received status code not expected: {response.status}"
|
||||||
assert response.headers['Access-Control-Allow-Origin'] == origin
|
assert response.headers['Access-Control-Allow-Origin'] == origin
|
||||||
assert response.headers['Content-Type'] == "application/json; charset=utf-8"
|
assert response.headers['Content-Type'] == "application/json; charset=utf-8"
|
||||||
@ -1194,7 +1204,7 @@ async def wait_for_slots_status(context,
|
|||||||
if 'GITHUB_ACTIONS' in os.environ:
|
if 'GITHUB_ACTIONS' in os.environ:
|
||||||
timeout *= 2
|
timeout *= 2
|
||||||
|
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
|
||||||
while True:
|
while True:
|
||||||
async with await session.get(f'{base_url}/slots', params=params) as slots_response:
|
async with await session.get(f'{base_url}/slots', params=params) as slots_response:
|
||||||
status_code = slots_response.status
|
status_code = slots_response.status
|
||||||
@ -1237,7 +1247,7 @@ def assert_embeddings(embeddings):
|
|||||||
|
|
||||||
|
|
||||||
async def request_slots_status(context, expected_slots):
|
async def request_slots_status(context, expected_slots):
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
|
||||||
async with await session.get(f'{context.base_url}/slots') as slots_response:
|
async with await session.get(f'{context.base_url}/slots') as slots_response:
|
||||||
assert slots_response.status == 200
|
assert slots_response.status == 200
|
||||||
slots = await slots_response.json()
|
slots = await slots_response.json()
|
||||||
|
@ -8,9 +8,12 @@ Feature: Wrong usage of llama.cpp server
|
|||||||
Scenario: Infinite loop
|
Scenario: Infinite loop
|
||||||
Given a server listening on localhost:8080
|
Given a server listening on localhost:8080
|
||||||
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
|
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
|
||||||
|
And 42 as server seed
|
||||||
|
And 2048 KV cache size
|
||||||
# Uncomment below to fix the issue
|
# Uncomment below to fix the issue
|
||||||
#And 64 server max tokens to predict
|
#And 64 server max tokens to predict
|
||||||
Then the server is starting
|
Then the server is starting
|
||||||
|
Then the server is healthy
|
||||||
Given a prompt:
|
Given a prompt:
|
||||||
"""
|
"""
|
||||||
Go to: infinite loop
|
Go to: infinite loop
|
||||||
|
@ -3,6 +3,14 @@
|
|||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
#ifndef NDEBUG
|
||||||
|
// crash the server in debug mode, otherwise send an http 500 error
|
||||||
|
#define CPPHTTPLIB_NO_EXCEPTIONS 1
|
||||||
|
#endif
|
||||||
|
// increase max payload length to allow use of larger context size
|
||||||
|
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
|
||||||
|
#include "httplib.h"
|
||||||
|
|
||||||
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
||||||
#define JSON_ASSERT GGML_ASSERT
|
#define JSON_ASSERT GGML_ASSERT
|
||||||
#include "json.hpp"
|
#include "json.hpp"
|
||||||
@ -279,6 +287,18 @@ static size_t find_partial_stop_string(const std::string &stop, const std::strin
|
|||||||
return std::string::npos;
|
return std::string::npos;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool json_is_array_of_numbers(json data) {
|
||||||
|
if (data.is_array()) {
|
||||||
|
for (const auto & e : data) {
|
||||||
|
if (!e.is_number()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: reuse llama_detokenize
|
// TODO: reuse llama_detokenize
|
||||||
template <class Iter>
|
template <class Iter>
|
||||||
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
|
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
|
||||||
@ -343,6 +363,19 @@ static json probs_vector_to_json(const llama_context * ctx, const std::vector<co
|
|||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool server_sent_event(httplib::DataSink & sink, const char * event, json & data) {
|
||||||
|
const std::string str =
|
||||||
|
std::string(event) + ": " +
|
||||||
|
data.dump(-1, ' ', false, json::error_handler_t::replace) +
|
||||||
|
"\n\n";
|
||||||
|
|
||||||
|
LOG_VERBOSE("data stream", {
|
||||||
|
{ "to_send", str }
|
||||||
|
});
|
||||||
|
|
||||||
|
return sink.write(str.c_str(), str.size());
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// OAI utils
|
// OAI utils
|
||||||
//
|
//
|
||||||
|
@ -6,9 +6,7 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
static void print_usage(int, char ** argv) {
|
||||||
gpt_params_print_usage(argc, argv, params);
|
|
||||||
|
|
||||||
LOG_TEE("\nexample usage:\n");
|
LOG_TEE("\nexample usage:\n");
|
||||||
LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
|
LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
@ -20,8 +18,8 @@ int main(int argc, char ** argv) {
|
|||||||
params.prompt = "Hello my name is";
|
params.prompt = "Hello my name is";
|
||||||
params.n_predict = 32;
|
params.n_predict = 32;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
|
||||||
print_usage(argc, argv, params);
|
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -55,6 +53,14 @@ int main(int argc, char ** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
auto sparams = llama_sampler_chain_default_params();
|
||||||
|
|
||||||
|
sparams.no_perf = false;
|
||||||
|
|
||||||
|
llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
||||||
|
|
||||||
|
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
|
||||||
|
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
|
|
||||||
std::vector<llama_token> tokens_list;
|
std::vector<llama_token> tokens_list;
|
||||||
@ -110,20 +116,9 @@ int main(int argc, char ** argv) {
|
|||||||
while (n_cur <= n_predict) {
|
while (n_cur <= n_predict) {
|
||||||
// sample the next token
|
// sample the next token
|
||||||
{
|
{
|
||||||
auto n_vocab = llama_n_vocab(model);
|
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
|
||||||
auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
|
|
||||||
|
|
||||||
std::vector<llama_token_data> candidates;
|
llama_sampler_accept(smpl, new_token_id);
|
||||||
candidates.reserve(n_vocab);
|
|
||||||
|
|
||||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
|
||||||
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
|
||||||
|
|
||||||
// sample the most likely token
|
|
||||||
const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
|
||||||
|
|
||||||
// is it an end of generation?
|
// is it an end of generation?
|
||||||
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
|
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
|
||||||
@ -160,12 +155,14 @@ int main(int argc, char ** argv) {
|
|||||||
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
|
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
|
||||||
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
||||||
|
|
||||||
llama_print_timings(ctx);
|
LOG_TEE("\n");
|
||||||
|
llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
|
||||||
|
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||||
|
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
|
|
||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
|
llama_sampler_free(smpl);
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
|
||||||
|
@ -21,14 +21,14 @@ struct seq_draft {
|
|||||||
std::vector<llama_token> tokens;
|
std::vector<llama_token> tokens;
|
||||||
std::vector<std::vector<llama_token_data>> dists;
|
std::vector<std::vector<llama_token_data>> dists;
|
||||||
|
|
||||||
struct llama_sampling_context * ctx_sampling;
|
struct gpt_sampler * smpl = nullptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SPECULATIVE);
|
||||||
gpt_params_print_usage(argc, argv, params);
|
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -43,10 +43,7 @@ int main(int argc, char ** argv) {
|
|||||||
// probability threshold for splitting a draft branch (only for n_seq_dft > 1)
|
// probability threshold for splitting a draft branch (only for n_seq_dft > 1)
|
||||||
const float p_split = params.p_split;
|
const float p_split = params.p_split;
|
||||||
|
|
||||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
std::default_random_engine rng(params.sparams.seed);
|
||||||
params.seed = time(NULL);
|
|
||||||
}
|
|
||||||
std::default_random_engine rng(params.seed);
|
|
||||||
std::uniform_real_distribution<> u_dist;
|
std::uniform_real_distribution<> u_dist;
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
#ifndef LOG_DISABLE_LOGS
|
||||||
@ -73,10 +70,11 @@ int main(int argc, char ** argv) {
|
|||||||
// load the draft model
|
// load the draft model
|
||||||
params.model = params.model_draft;
|
params.model = params.model_draft;
|
||||||
params.n_gpu_layers = params.n_gpu_layers_draft;
|
params.n_gpu_layers = params.n_gpu_layers_draft;
|
||||||
if (params.n_threads_draft > 0) {
|
if (params.draft_cpuparams.n_threads > 0) {
|
||||||
params.n_threads = params.n_threads_draft;
|
params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
|
||||||
}
|
}
|
||||||
params.n_threads_batch = params.n_threads_batch_draft;
|
|
||||||
|
params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
|
||||||
llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
|
llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
|
||||||
model_dft = llama_init_dft.model;
|
model_dft = llama_init_dft.model;
|
||||||
ctx_dft = llama_init_dft.context;
|
ctx_dft = llama_init_dft.context;
|
||||||
@ -178,19 +176,17 @@ int main(int argc, char ** argv) {
|
|||||||
// used to determine end of generation
|
// used to determine end of generation
|
||||||
bool has_eos = false;
|
bool has_eos = false;
|
||||||
|
|
||||||
// target model sampling context
|
// target model sampling context (reuse the llama_context's sampling instance)
|
||||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
|
struct gpt_sampler * smpl = gpt_sampler_init(model_tgt, params.sparams);
|
||||||
|
|
||||||
|
struct llama_sampler * softmax = llama_sampler_init_softmax();
|
||||||
|
|
||||||
// draft sequence data
|
// draft sequence data
|
||||||
std::vector<seq_draft> drafts(n_seq_dft);
|
std::vector<seq_draft> drafts(n_seq_dft);
|
||||||
|
|
||||||
params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
|
|
||||||
if (params.sparams.temp == 0) {
|
|
||||||
params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int s = 0; s < n_seq_dft; ++s) {
|
for (int s = 0; s < n_seq_dft; ++s) {
|
||||||
drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
|
// allocate gpt_sampler for each draft sequence
|
||||||
|
drafts[s].smpl = gpt_sampler_init(model_dft, params.sparams);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
|
llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
|
||||||
@ -232,12 +228,12 @@ int main(int argc, char ** argv) {
|
|||||||
bool accept = false;
|
bool accept = false;
|
||||||
if (params.sparams.temp > 0) {
|
if (params.sparams.temp > 0) {
|
||||||
// stochastic verification
|
// stochastic verification
|
||||||
|
gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
|
||||||
|
|
||||||
llama_token_data_array dist_tgt = llama_sampling_prepare(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft], true, NULL);
|
auto & dist_tgt = *gpt_sampler_get_candidates(smpl);
|
||||||
llama_sample_softmax(ctx_tgt, &dist_tgt);
|
|
||||||
float p_tgt = 0, p_dft = 0;
|
|
||||||
|
|
||||||
// GGML_ASSERT(dist_tgt.size() == dist_dft.size());
|
float p_tgt = 0.0f;
|
||||||
|
float p_dft = 0.0f;
|
||||||
|
|
||||||
while (active_seqs.size() > 0) {
|
while (active_seqs.size() > 0) {
|
||||||
// randomly select a sequence to verify from active sequences
|
// randomly select a sequence to verify from active sequences
|
||||||
@ -256,9 +252,13 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
|
LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
|
||||||
float r = u_dist(rng);
|
float r = u_dist(rng);
|
||||||
llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), true };
|
llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true };
|
||||||
|
|
||||||
|
//GGML_ASSERT(dist_tgt.size <= dist_dft.size);
|
||||||
|
|
||||||
// acquire the token probabilities assigned by the draft and target models
|
// acquire the token probabilities assigned by the draft and target models
|
||||||
for (size_t i = 0; i < dist_tgt.size; i++) {
|
for (size_t i = 0; i < dist_tgt.size; i++) {
|
||||||
if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) {
|
if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) {
|
||||||
@ -277,7 +277,7 @@ int main(int argc, char ** argv) {
|
|||||||
accept = true;
|
accept = true;
|
||||||
token_id = drafts[s].tokens[i_dft];
|
token_id = drafts[s].tokens[i_dft];
|
||||||
token_str = llama_token_to_piece(ctx_tgt, token_id);
|
token_str = llama_token_to_piece(ctx_tgt, token_id);
|
||||||
llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
|
gpt_sampler_accept(smpl, token_id, true);
|
||||||
|
|
||||||
LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
|
LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
|
||||||
break;
|
break;
|
||||||
@ -288,7 +288,6 @@ int main(int argc, char ** argv) {
|
|||||||
// calculate residual probability
|
// calculate residual probability
|
||||||
GGML_ASSERT(dist_tgt.sorted);
|
GGML_ASSERT(dist_tgt.sorted);
|
||||||
GGML_ASSERT(dist_dft.sorted);
|
GGML_ASSERT(dist_dft.sorted);
|
||||||
float sum_probs = 0.0f;
|
|
||||||
|
|
||||||
// sort dist by id
|
// sort dist by id
|
||||||
std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) {
|
std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) {
|
||||||
@ -298,10 +297,18 @@ int main(int argc, char ** argv) {
|
|||||||
return a.id < b.id;
|
return a.id < b.id;
|
||||||
});
|
});
|
||||||
|
|
||||||
|
float sum_probs = 0.0f;
|
||||||
|
|
||||||
for (size_t i = 0; i < dist_tgt.size; i++) {
|
for (size_t i = 0; i < dist_tgt.size; i++) {
|
||||||
dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p - dist_dft.data[i].p);
|
if (i < dist_dft.size) {
|
||||||
|
dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p - dist_dft.data[i].p);
|
||||||
|
} else {
|
||||||
|
dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p);
|
||||||
|
}
|
||||||
|
|
||||||
sum_probs += dist_tgt.data[i].p;
|
sum_probs += dist_tgt.data[i].p;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < dist_tgt.size; i++) {
|
for (size_t i = 0; i < dist_tgt.size; i++) {
|
||||||
dist_tgt.data[i].p /= sum_probs;
|
dist_tgt.data[i].p /= sum_probs;
|
||||||
}
|
}
|
||||||
@ -331,21 +338,29 @@ int main(int argc, char ** argv) {
|
|||||||
// all drafted tokens were rejected
|
// all drafted tokens were rejected
|
||||||
// sample from the target model
|
// sample from the target model
|
||||||
LOG("all drafted tokens were rejected, sampling from residual distribution\n");
|
LOG("all drafted tokens were rejected, sampling from residual distribution\n");
|
||||||
token_id = llama_sample_token(ctx_tgt, &dist_tgt);
|
std::vector<float> probs(dist_tgt.size);
|
||||||
llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
|
for (size_t i = 0; i < dist_tgt.size; ++i) {
|
||||||
|
probs[i] = dist_tgt.data[i].p;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::discrete_distribution<> dist(probs.begin(), probs.end());
|
||||||
|
|
||||||
|
const int idx = dist(rng);
|
||||||
|
|
||||||
|
token_id = dist_tgt.data[idx].id;
|
||||||
|
gpt_sampler_accept(smpl, token_id, true);
|
||||||
token_str = llama_token_to_piece(ctx_tgt, token_id);
|
token_str = llama_token_to_piece(ctx_tgt, token_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
// greedy verification
|
// greedy verification
|
||||||
|
|
||||||
// sample from the target model
|
// sample from the target model
|
||||||
LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
|
LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||||
token_id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
|
token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||||
|
|
||||||
llama_sampling_accept(ctx_sampling, ctx_tgt, token_id, true);
|
gpt_sampler_accept(smpl, token_id, true);
|
||||||
|
|
||||||
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
|
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, smpl->prev).c_str());
|
||||||
|
|
||||||
token_str = llama_token_to_piece(ctx_tgt, token_id);
|
token_str = llama_token_to_piece(ctx_tgt, token_id);
|
||||||
|
|
||||||
@ -433,7 +448,10 @@ int main(int argc, char ** argv) {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_sampling_cp(ctx_sampling, drafts[0].ctx_sampling);
|
if (drafts[0].smpl) {
|
||||||
|
gpt_sampler_free(drafts[0].smpl);
|
||||||
|
}
|
||||||
|
drafts[0].smpl = gpt_sampler_clone(smpl);
|
||||||
|
|
||||||
int n_seq_cur = 1;
|
int n_seq_cur = 1;
|
||||||
int n_past_cur = n_past_dft;
|
int n_past_cur = n_past_dft;
|
||||||
@ -462,20 +480,20 @@ int main(int argc, char ** argv) {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft);
|
gpt_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);
|
||||||
|
|
||||||
const auto & cur_p = drafts[s].ctx_sampling->cur;
|
const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl);
|
||||||
|
|
||||||
for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) {
|
for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
|
||||||
LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
|
LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
|
||||||
k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str());
|
k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<int> sa(1, s);
|
std::vector<int> sa(1, s);
|
||||||
|
|
||||||
// attempt to split the branch if the probability is high enough
|
// attempt to split the branch if the probability is high enough
|
||||||
for (int f = 1; f < 8; ++f) {
|
for (int f = 1; f < 8; ++f) {
|
||||||
if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) {
|
if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) {
|
||||||
LOG("splitting seq %3d into %3d\n", s, n_seq_cur);
|
LOG("splitting seq %3d into %3d\n", s, n_seq_cur);
|
||||||
|
|
||||||
llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1);
|
llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1);
|
||||||
@ -502,7 +520,10 @@ int main(int argc, char ** argv) {
|
|||||||
drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft;
|
drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft;
|
||||||
drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;
|
drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;
|
||||||
|
|
||||||
llama_sampling_cp(drafts[s].ctx_sampling, drafts[n_seq_cur].ctx_sampling);
|
if (drafts[n_seq_cur].smpl) {
|
||||||
|
gpt_sampler_free(drafts[n_seq_cur].smpl);
|
||||||
|
}
|
||||||
|
drafts[n_seq_cur].smpl = gpt_sampler_clone(drafts[s].smpl);
|
||||||
|
|
||||||
sa.push_back(n_seq_cur);
|
sa.push_back(n_seq_cur);
|
||||||
|
|
||||||
@ -514,15 +535,15 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// add drafted token for each sequence
|
// add drafted token for each sequence
|
||||||
for (int is = 0; is < (int) sa.size(); ++is) {
|
for (int is = 0; is < (int) sa.size(); ++is) {
|
||||||
const llama_token id = cur_p[is].id;
|
const llama_token id = cur_p->data[is].id;
|
||||||
|
|
||||||
const int s = sa[is];
|
const int s = sa[is];
|
||||||
|
|
||||||
llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true);
|
gpt_sampler_accept(drafts[s].smpl, id, true);
|
||||||
|
|
||||||
drafts[s].tokens.push_back(id);
|
drafts[s].tokens.push_back(id);
|
||||||
// save cur_p.data into drafts[s].dists
|
// save cur_p.data into drafts[s].dists
|
||||||
drafts[s].dists.push_back(cur_p);
|
drafts[s].dists.push_back({cur_p->data, cur_p->data + cur_p->size});
|
||||||
|
|
||||||
// add unique drafted tokens to the target batch
|
// add unique drafted tokens to the target batch
|
||||||
drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
|
drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
|
||||||
@ -592,17 +613,19 @@ int main(int argc, char ** argv) {
|
|||||||
LOG_TEE("n_accept = %d\n", n_accept);
|
LOG_TEE("n_accept = %d\n", n_accept);
|
||||||
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||||
|
|
||||||
LOG_TEE("\ndraft:\n");
|
LOG_TEE("\ndraft:\n\n");
|
||||||
llama_print_timings(ctx_dft);
|
// TODO: print sampling/grammar timings for all drafts
|
||||||
|
llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT);
|
||||||
|
|
||||||
LOG_TEE("\ntarget:\n");
|
LOG_TEE("\ntarget:\n\n");
|
||||||
llama_print_timings(ctx_tgt);
|
gpt_perf_print(ctx_tgt, smpl);
|
||||||
|
|
||||||
llama_sampling_free(ctx_sampling);
|
gpt_sampler_free(smpl);
|
||||||
for (int s = 0; s < n_seq_dft; ++s) {
|
for (int s = 0; s < n_seq_dft; ++s) {
|
||||||
llama_sampling_free(drafts[s].ctx_sampling);
|
gpt_sampler_free(drafts[s].smpl);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
llama_sampler_free(softmax);
|
||||||
llama_batch_free(batch_dft);
|
llama_batch_free(batch_dft);
|
||||||
|
|
||||||
llama_free(ctx_tgt);
|
llama_free(ctx_tgt);
|
||||||
|
12
flake.lock
12
flake.lock
@ -5,11 +5,11 @@
|
|||||||
"nixpkgs-lib": "nixpkgs-lib"
|
"nixpkgs-lib": "nixpkgs-lib"
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1722555600,
|
"lastModified": 1725024810,
|
||||||
"narHash": "sha256-XOQkdLafnb/p9ij77byFQjDf5m5QYl9b2REiVClC+x4=",
|
"narHash": "sha256-ODYRm8zHfLTH3soTFWE452ydPYz2iTvr9T8ftDMUQ3E=",
|
||||||
"owner": "hercules-ci",
|
"owner": "hercules-ci",
|
||||||
"repo": "flake-parts",
|
"repo": "flake-parts",
|
||||||
"rev": "8471fe90ad337a8074e957b69ca4d0089218391d",
|
"rev": "af510d4a62d071ea13925ce41c95e3dec816c01d",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
@ -20,11 +20,11 @@
|
|||||||
},
|
},
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1723637854,
|
"lastModified": 1724819573,
|
||||||
"narHash": "sha256-med8+5DSWa2UnOqtdICndjDAEjxr5D7zaIiK4pn0Q7c=",
|
"narHash": "sha256-GnR7/ibgIH1vhoy8cYdmXE6iyZqKqFxQSVkFgosBh6w=",
|
||||||
"owner": "NixOS",
|
"owner": "NixOS",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "c3aa7b8938b17aebd2deecf7be0636000d62a2b9",
|
"rev": "71e91c409d1e654808b2621f28a327acfdad8dc2",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
@ -145,7 +145,9 @@
|
|||||||
# the same path you would with an overlay.
|
# the same path you would with an overlay.
|
||||||
legacyPackages = {
|
legacyPackages = {
|
||||||
llamaPackages = pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
|
llamaPackages = pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
|
||||||
llamaPackagesWindows = pkgs.pkgsCross.mingwW64.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
|
llamaPackagesWindows = pkgs.pkgsCross.mingwW64.callPackage .devops/nix/scope.nix {
|
||||||
|
inherit llamaVersion;
|
||||||
|
};
|
||||||
llamaPackagesCuda = pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
|
llamaPackagesCuda = pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
|
||||||
llamaPackagesRocm = pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
|
llamaPackagesRocm = pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; };
|
||||||
};
|
};
|
||||||
@ -157,6 +159,7 @@
|
|||||||
default = config.legacyPackages.llamaPackages.llama-cpp;
|
default = config.legacyPackages.llamaPackages.llama-cpp;
|
||||||
vulkan = config.packages.default.override { useVulkan = true; };
|
vulkan = config.packages.default.override { useVulkan = true; };
|
||||||
windows = config.legacyPackages.llamaPackagesWindows.llama-cpp;
|
windows = config.legacyPackages.llamaPackagesWindows.llama-cpp;
|
||||||
|
python-scripts = config.legacyPackages.llamaPackages.python-scripts;
|
||||||
}
|
}
|
||||||
// lib.optionalAttrs pkgs.stdenv.isLinux {
|
// lib.optionalAttrs pkgs.stdenv.isLinux {
|
||||||
cuda = config.legacyPackages.llamaPackagesCuda.llama-cpp;
|
cuda = config.legacyPackages.llamaPackagesCuda.llama-cpp;
|
||||||
|
@ -135,6 +135,7 @@ option(GGML_VULKAN "ggml: use Vulkan"
|
|||||||
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
|
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
|
||||||
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
|
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
|
||||||
option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
|
option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
|
||||||
|
option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
|
||||||
option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" OFF)
|
option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" OFF)
|
||||||
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
|
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
|
||||||
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
|
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
|
||||||
|
@ -7,8 +7,8 @@ extern "C" {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
|
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
|
||||||
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
|
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
|
||||||
typedef struct ggml_backend * ggml_backend_t;
|
typedef struct ggml_backend * ggml_backend_t;
|
||||||
|
|
||||||
// Tensor allocator
|
// Tensor allocator
|
||||||
struct ggml_tallocr {
|
struct ggml_tallocr {
|
||||||
|
@ -63,6 +63,7 @@ extern "C" {
|
|||||||
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||||
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||||
|
|
||||||
|
// "offset" refers to the offset of the tensor data for setting/getting data
|
||||||
GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||||
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||||
|
|
||||||
@ -102,6 +103,7 @@ extern "C" {
|
|||||||
|
|
||||||
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
|
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
|
||||||
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
||||||
|
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
|
||||||
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
||||||
|
|
||||||
// Create a backend buffer from an existing pointer
|
// Create a backend buffer from an existing pointer
|
||||||
|
@ -220,7 +220,7 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
|
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
|
||||||
#define GGML_FILE_VERSION 1
|
#define GGML_FILE_VERSION 2
|
||||||
|
|
||||||
#define GGML_QNT_VERSION 2 // bump this on quantization format changes
|
#define GGML_QNT_VERSION 2 // bump this on quantization format changes
|
||||||
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
||||||
@ -231,6 +231,8 @@
|
|||||||
#define GGML_MAX_SRC 10
|
#define GGML_MAX_SRC 10
|
||||||
#ifndef GGML_MAX_NAME
|
#ifndef GGML_MAX_NAME
|
||||||
#define GGML_MAX_NAME 64
|
#define GGML_MAX_NAME 64
|
||||||
|
#define GGML_MAX_N_THREADS 512
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
#define GGML_MAX_OP_PARAMS 64
|
#define GGML_MAX_OP_PARAMS 64
|
||||||
#define GGML_DEFAULT_N_THREADS 4
|
#define GGML_DEFAULT_N_THREADS 4
|
||||||
@ -393,6 +395,8 @@ extern "C" {
|
|||||||
GGML_TYPE_Q4_0_4_4 = 31,
|
GGML_TYPE_Q4_0_4_4 = 31,
|
||||||
GGML_TYPE_Q4_0_4_8 = 32,
|
GGML_TYPE_Q4_0_4_8 = 32,
|
||||||
GGML_TYPE_Q4_0_8_8 = 33,
|
GGML_TYPE_Q4_0_8_8 = 33,
|
||||||
|
GGML_TYPE_TQ1_0 = 34,
|
||||||
|
GGML_TYPE_TQ2_0 = 35,
|
||||||
GGML_TYPE_COUNT,
|
GGML_TYPE_COUNT,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -453,6 +457,8 @@ extern "C" {
|
|||||||
GGML_OP_SQR,
|
GGML_OP_SQR,
|
||||||
GGML_OP_SQRT,
|
GGML_OP_SQRT,
|
||||||
GGML_OP_LOG,
|
GGML_OP_LOG,
|
||||||
|
GGML_OP_SIN,
|
||||||
|
GGML_OP_COS,
|
||||||
GGML_OP_SUM,
|
GGML_OP_SUM,
|
||||||
GGML_OP_SUM_ROWS,
|
GGML_OP_SUM_ROWS,
|
||||||
GGML_OP_MEAN,
|
GGML_OP_MEAN,
|
||||||
@ -490,9 +496,11 @@ extern "C" {
|
|||||||
GGML_OP_CLAMP,
|
GGML_OP_CLAMP,
|
||||||
GGML_OP_CONV_TRANSPOSE_1D,
|
GGML_OP_CONV_TRANSPOSE_1D,
|
||||||
GGML_OP_IM2COL,
|
GGML_OP_IM2COL,
|
||||||
|
GGML_OP_IM2COL_BACK,
|
||||||
GGML_OP_CONV_TRANSPOSE_2D,
|
GGML_OP_CONV_TRANSPOSE_2D,
|
||||||
GGML_OP_POOL_1D,
|
GGML_OP_POOL_1D,
|
||||||
GGML_OP_POOL_2D,
|
GGML_OP_POOL_2D,
|
||||||
|
GGML_OP_POOL_2D_BACK,
|
||||||
GGML_OP_UPSCALE, // nearest interpolate
|
GGML_OP_UPSCALE, // nearest interpolate
|
||||||
GGML_OP_PAD,
|
GGML_OP_PAD,
|
||||||
GGML_OP_ARANGE,
|
GGML_OP_ARANGE,
|
||||||
@ -508,6 +516,7 @@ extern "C" {
|
|||||||
GGML_OP_WIN_UNPART,
|
GGML_OP_WIN_UNPART,
|
||||||
GGML_OP_GET_REL_POS,
|
GGML_OP_GET_REL_POS,
|
||||||
GGML_OP_ADD_REL_POS,
|
GGML_OP_ADD_REL_POS,
|
||||||
|
GGML_OP_RWKV_WKV,
|
||||||
|
|
||||||
GGML_OP_UNARY,
|
GGML_OP_UNARY,
|
||||||
|
|
||||||
@ -542,6 +551,7 @@ extern "C" {
|
|||||||
GGML_UNARY_OP_SILU,
|
GGML_UNARY_OP_SILU,
|
||||||
GGML_UNARY_OP_HARDSWISH,
|
GGML_UNARY_OP_HARDSWISH,
|
||||||
GGML_UNARY_OP_HARDSIGMOID,
|
GGML_UNARY_OP_HARDSIGMOID,
|
||||||
|
GGML_UNARY_OP_EXP,
|
||||||
|
|
||||||
GGML_UNARY_OP_COUNT,
|
GGML_UNARY_OP_COUNT,
|
||||||
};
|
};
|
||||||
@ -624,6 +634,29 @@ extern "C" {
|
|||||||
// If it returns true, the computation is aborted
|
// If it returns true, the computation is aborted
|
||||||
typedef bool (*ggml_abort_callback)(void * data);
|
typedef bool (*ggml_abort_callback)(void * data);
|
||||||
|
|
||||||
|
// Scheduling priorities
|
||||||
|
enum ggml_sched_priority {
|
||||||
|
GGML_SCHED_PRIO_NORMAL,
|
||||||
|
GGML_SCHED_PRIO_MEDIUM,
|
||||||
|
GGML_SCHED_PRIO_HIGH,
|
||||||
|
GGML_SCHED_PRIO_REALTIME
|
||||||
|
};
|
||||||
|
|
||||||
|
// Threadpool params
|
||||||
|
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
|
||||||
|
struct ggml_threadpool_params {
|
||||||
|
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
|
||||||
|
int n_threads; // number of threads
|
||||||
|
enum ggml_sched_priority prio; // thread priority
|
||||||
|
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
|
||||||
|
bool strict_cpu; // strict cpu placement
|
||||||
|
bool paused; // start in paused state
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ggml_threadpool; // forward declaration, see ggml.c
|
||||||
|
|
||||||
|
typedef struct ggml_threadpool * ggml_threadpool_t;
|
||||||
|
|
||||||
// the compute plan that needs to be prepared for ggml_graph_compute()
|
// the compute plan that needs to be prepared for ggml_graph_compute()
|
||||||
// since https://github.com/ggerganov/ggml/issues/287
|
// since https://github.com/ggerganov/ggml/issues/287
|
||||||
struct ggml_cplan {
|
struct ggml_cplan {
|
||||||
@ -631,6 +664,7 @@ extern "C" {
|
|||||||
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
|
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
|
||||||
|
|
||||||
int n_threads;
|
int n_threads;
|
||||||
|
struct ggml_threadpool * threadpool;
|
||||||
|
|
||||||
// abort ggml_graph_compute when true
|
// abort ggml_graph_compute when true
|
||||||
ggml_abort_callback abort_callback;
|
ggml_abort_callback abort_callback;
|
||||||
@ -647,8 +681,8 @@ extern "C" {
|
|||||||
|
|
||||||
struct ggml_hash_set {
|
struct ggml_hash_set {
|
||||||
size_t size;
|
size_t size;
|
||||||
ggml_bitset_t * used;
|
ggml_bitset_t * used; // whether or not the keys are in use i.e. set
|
||||||
struct ggml_tensor ** keys;
|
struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
|
||||||
};
|
};
|
||||||
|
|
||||||
// computation graph
|
// computation graph
|
||||||
@ -969,6 +1003,22 @@ extern "C" {
|
|||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a);
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_sin(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_sin_inplace(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_cos(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_cos_inplace(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
// return scalar
|
// return scalar
|
||||||
GGML_API struct ggml_tensor * ggml_sum(
|
GGML_API struct ggml_tensor * ggml_sum(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
@ -1119,6 +1169,14 @@ extern "C" {
|
|||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a);
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_exp(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_exp_inplace(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
// normalize along rows
|
// normalize along rows
|
||||||
GGML_API struct ggml_tensor * ggml_norm(
|
GGML_API struct ggml_tensor * ggml_norm(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
@ -1214,7 +1272,7 @@ extern "C" {
|
|||||||
size_t nb1,
|
size_t nb1,
|
||||||
size_t nb2,
|
size_t nb2,
|
||||||
size_t nb3,
|
size_t nb3,
|
||||||
size_t offset);
|
size_t offset); // in bytes
|
||||||
|
|
||||||
// b -> view(a,offset,nb1,nb2,3), return view(a)
|
// b -> view(a,offset,nb1,nb2,3), return view(a)
|
||||||
GGML_API struct ggml_tensor * ggml_set_inplace(
|
GGML_API struct ggml_tensor * ggml_set_inplace(
|
||||||
@ -1224,19 +1282,19 @@ extern "C" {
|
|||||||
size_t nb1,
|
size_t nb1,
|
||||||
size_t nb2,
|
size_t nb2,
|
||||||
size_t nb3,
|
size_t nb3,
|
||||||
size_t offset);
|
size_t offset); // in bytes
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_set_1d(
|
GGML_API struct ggml_tensor * ggml_set_1d(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b,
|
||||||
size_t offset);
|
size_t offset); // in bytes
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_set_1d_inplace(
|
GGML_API struct ggml_tensor * ggml_set_1d_inplace(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b,
|
||||||
size_t offset);
|
size_t offset); // in bytes
|
||||||
|
|
||||||
// b -> view(a,offset,nb1,nb2,3), return modified a
|
// b -> view(a,offset,nb1,nb2,3), return modified a
|
||||||
GGML_API struct ggml_tensor * ggml_set_2d(
|
GGML_API struct ggml_tensor * ggml_set_2d(
|
||||||
@ -1244,7 +1302,7 @@ extern "C" {
|
|||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b,
|
||||||
size_t nb1,
|
size_t nb1,
|
||||||
size_t offset);
|
size_t offset); // in bytes
|
||||||
|
|
||||||
// b -> view(a,offset,nb1,nb2,3), return view(a)
|
// b -> view(a,offset,nb1,nb2,3), return view(a)
|
||||||
GGML_API struct ggml_tensor * ggml_set_2d_inplace(
|
GGML_API struct ggml_tensor * ggml_set_2d_inplace(
|
||||||
@ -1252,7 +1310,7 @@ extern "C" {
|
|||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b,
|
||||||
size_t nb1,
|
size_t nb1,
|
||||||
size_t offset);
|
size_t offset); // in bytes
|
||||||
|
|
||||||
// a -> b, return view(b)
|
// a -> b, return view(b)
|
||||||
GGML_API struct ggml_tensor * ggml_cpy(
|
GGML_API struct ggml_tensor * ggml_cpy(
|
||||||
@ -1566,34 +1624,49 @@ extern "C" {
|
|||||||
float min,
|
float min,
|
||||||
float max);
|
float max);
|
||||||
|
|
||||||
|
// im2col
|
||||||
|
// converts data into a format that effectively results in a convolution when combined with matrix multiplication
|
||||||
GGML_API struct ggml_tensor * ggml_im2col(
|
GGML_API struct ggml_tensor * ggml_im2col(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a, // convolution kernel
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b, // data
|
||||||
int s0,
|
int s0, // stride dimension 0
|
||||||
int s1,
|
int s1, // stride dimension 1
|
||||||
int p0,
|
int p0, // padding dimension 0
|
||||||
int p1,
|
int p1, // padding dimension 1
|
||||||
int d0,
|
int d0, // dilation dimension 0
|
||||||
int d1,
|
int d1, // dilation dimension 1
|
||||||
bool is_2D,
|
bool is_2D,
|
||||||
enum ggml_type dst_type);
|
enum ggml_type dst_type);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_im2col_back(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a, // convolution kernel
|
||||||
|
struct ggml_tensor * b, // gradient of im2col output
|
||||||
|
int64_t * ne, // shape of im2col input
|
||||||
|
int s0, // stride dimension 0
|
||||||
|
int s1, // stride dimension 1
|
||||||
|
int p0, // padding dimension 0
|
||||||
|
int p1, // padding dimension 1
|
||||||
|
int d0, // dilation dimension 0
|
||||||
|
int d1, // dilation dimension 1
|
||||||
|
bool is_2D);
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
|
GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a, // convolution kernel
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b, // data
|
||||||
int s0,
|
int s0, // stride dimension 0
|
||||||
int s1,
|
int s1, // stride dimension 1
|
||||||
int p0,
|
int p0, // padding dimension 0
|
||||||
int p1,
|
int p1, // padding dimension 1
|
||||||
int d0,
|
int d0, // dilation dimension 0
|
||||||
int d1);
|
int d1); // dilation dimension 1
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_conv_1d(
|
GGML_API struct ggml_tensor * ggml_conv_1d(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a, // convolution kernel
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b, // data
|
||||||
int s0, // stride
|
int s0, // stride
|
||||||
int p0, // padding
|
int p0, // padding
|
||||||
int d0); // dilation
|
int d0); // dilation
|
||||||
@ -1602,29 +1675,29 @@ extern "C" {
|
|||||||
// alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
|
// alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
|
||||||
GGML_API struct ggml_tensor* ggml_conv_1d_ph(
|
GGML_API struct ggml_tensor* ggml_conv_1d_ph(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a, // convolution kernel
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b, // data
|
||||||
int s,
|
int s, // stride
|
||||||
int d);
|
int d); // dilation
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
|
GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a, // convolution kernel
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b, // data
|
||||||
int s0,
|
int s0, // stride
|
||||||
int p0,
|
int p0, // padding
|
||||||
int d0);
|
int d0); // dilation
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_conv_2d(
|
GGML_API struct ggml_tensor * ggml_conv_2d(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a, // convolution kernel
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b, // data
|
||||||
int s0,
|
int s0, // stride dimension 0
|
||||||
int s1,
|
int s1, // stride dimension 1
|
||||||
int p0,
|
int p0, // padding dimension 0
|
||||||
int p1,
|
int p1, // padding dimension 1
|
||||||
int d0,
|
int d0, // dilation dimension 0
|
||||||
int d1);
|
int d1); // dilation dimension 1
|
||||||
|
|
||||||
|
|
||||||
// kernel size is a->ne[0] x a->ne[1]
|
// kernel size is a->ne[0] x a->ne[1]
|
||||||
@ -1686,6 +1759,18 @@ extern "C" {
|
|||||||
float p0,
|
float p0,
|
||||||
float p1);
|
float p1);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_pool_2d_back(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * af, // "a"/input used in forward pass
|
||||||
|
enum ggml_op_pool op,
|
||||||
|
int k0,
|
||||||
|
int k1,
|
||||||
|
int s0,
|
||||||
|
int s1,
|
||||||
|
float p0,
|
||||||
|
float p1);
|
||||||
|
|
||||||
// nearest interpolate
|
// nearest interpolate
|
||||||
// multiplies ne0 and ne1 by scale factor
|
// multiplies ne0 and ne1 by scale factor
|
||||||
// used in stable-diffusion
|
// used in stable-diffusion
|
||||||
@ -1760,7 +1845,8 @@ extern "C" {
|
|||||||
struct ggml_tensor * v,
|
struct ggml_tensor * v,
|
||||||
struct ggml_tensor * mask,
|
struct ggml_tensor * mask,
|
||||||
float scale,
|
float scale,
|
||||||
float max_bias);
|
float max_bias,
|
||||||
|
float logit_softcap);
|
||||||
|
|
||||||
GGML_API void ggml_flash_attn_ext_set_prec(
|
GGML_API void ggml_flash_attn_ext_set_prec(
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
@ -1777,10 +1863,8 @@ extern "C" {
|
|||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_ssm_conv(
|
GGML_API struct ggml_tensor * ggml_ssm_conv(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * s,
|
struct ggml_tensor * sx,
|
||||||
struct ggml_tensor * x,
|
struct ggml_tensor * c);
|
||||||
struct ggml_tensor * c,
|
|
||||||
struct ggml_tensor * sq);
|
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_ssm_scan(
|
GGML_API struct ggml_tensor * ggml_ssm_scan(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
@ -1789,8 +1873,7 @@ extern "C" {
|
|||||||
struct ggml_tensor * dt,
|
struct ggml_tensor * dt,
|
||||||
struct ggml_tensor * A,
|
struct ggml_tensor * A,
|
||||||
struct ggml_tensor * B,
|
struct ggml_tensor * B,
|
||||||
struct ggml_tensor * C,
|
struct ggml_tensor * C);
|
||||||
struct ggml_tensor * sq);
|
|
||||||
|
|
||||||
// partition into non-overlapping windows with padding if needed
|
// partition into non-overlapping windows with padding if needed
|
||||||
// example:
|
// example:
|
||||||
@ -1842,6 +1925,15 @@ extern "C" {
|
|||||||
struct ggml_tensor * pw,
|
struct ggml_tensor * pw,
|
||||||
struct ggml_tensor * ph);
|
struct ggml_tensor * ph);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_rwkv_wkv(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * k,
|
||||||
|
struct ggml_tensor * v,
|
||||||
|
struct ggml_tensor * r,
|
||||||
|
struct ggml_tensor * tf,
|
||||||
|
struct ggml_tensor * td,
|
||||||
|
struct ggml_tensor * state);
|
||||||
|
|
||||||
// custom operators
|
// custom operators
|
||||||
|
|
||||||
typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
|
typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
|
||||||
@ -2012,10 +2104,23 @@ extern "C" {
|
|||||||
GGML_API size_t ggml_graph_overhead(void);
|
GGML_API size_t ggml_graph_overhead(void);
|
||||||
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
|
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
|
||||||
|
|
||||||
|
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||||
|
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params *p, int n_threads);
|
||||||
|
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
|
||||||
|
GGML_API struct ggml_threadpool* ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||||
|
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||||
|
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
||||||
|
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||||
|
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||||
|
|
||||||
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
||||||
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
||||||
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
|
GGML_API struct ggml_cplan ggml_graph_plan(
|
||||||
GGML_API enum ggml_status ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
const struct ggml_cgraph * cgraph,
|
||||||
|
int n_threads, /* = GGML_DEFAULT_N_THREADS */
|
||||||
|
struct ggml_threadpool * threadpool /* = NULL */ );
|
||||||
|
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
||||||
|
|
||||||
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
||||||
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
||||||
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
||||||
|
@ -549,6 +549,13 @@ if (GGML_SYCL)
|
|||||||
file(GLOB GGML_SOURCES_SYCL "ggml-sycl/*.cpp")
|
file(GLOB GGML_SOURCES_SYCL "ggml-sycl/*.cpp")
|
||||||
list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")
|
list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")
|
||||||
|
|
||||||
|
find_package(DNNL)
|
||||||
|
message("-- DNNL found:" ${DNNL_FOUND})
|
||||||
|
if (GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||||
|
add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND})
|
||||||
|
else()
|
||||||
|
add_compile_definitions(GGML_SYCL_DNNL=0)
|
||||||
|
endif()
|
||||||
if (WIN32)
|
if (WIN32)
|
||||||
find_package(IntelSYCL REQUIRED)
|
find_package(IntelSYCL REQUIRED)
|
||||||
find_package(MKL REQUIRED)
|
find_package(MKL REQUIRED)
|
||||||
@ -561,6 +568,9 @@ if (GGML_SYCL)
|
|||||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
|
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||||
|
list(APPEND GGML_EXTRA_LIBS DNNL::dnnl)
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (GGML_RPC)
|
if (GGML_RPC)
|
||||||
@ -602,6 +612,10 @@ if (GGML_VULKAN)
|
|||||||
add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
|
add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if (GGML_VULKAN_SHADER_DEBUG_INFO)
|
||||||
|
add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO)
|
||||||
|
endif()
|
||||||
|
|
||||||
if (GGML_VULKAN_PERF)
|
if (GGML_VULKAN_PERF)
|
||||||
add_compile_definitions(GGML_VULKAN_PERF)
|
add_compile_definitions(GGML_VULKAN_PERF)
|
||||||
endif()
|
endif()
|
||||||
@ -1237,7 +1251,7 @@ endif()
|
|||||||
|
|
||||||
# Data types, macros and functions related to controlling CPU affinity and
|
# Data types, macros and functions related to controlling CPU affinity and
|
||||||
# some memory allocation are available on Linux through GNU extensions in libc
|
# some memory allocation are available on Linux through GNU extensions in libc
|
||||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android")
|
||||||
add_compile_definitions(_GNU_SOURCE)
|
add_compile_definitions(_GNU_SOURCE)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
@ -36,6 +36,84 @@
|
|||||||
// from bias offset form to pure sign form (this saves subtract
|
// from bias offset form to pure sign form (this saves subtract
|
||||||
// operations during unpacking)
|
// operations during unpacking)
|
||||||
//
|
//
|
||||||
|
#if defined(__AVX__)
|
||||||
|
#if defined(__F16C__)
|
||||||
|
// the _mm256_cvt intrinsics require F16C
|
||||||
|
#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
|
||||||
|
#define GGML_F32Cx8_REPEAT_LOAD(x, loadMask) _mm256_cvtph_ps(_mm_shuffle_epi32(_mm_maskload_epi32((int const*)(x), loadMask), 68))
|
||||||
|
#define GGML_F32Cx8_REARRANGE_LOAD(x, arrangeMask) _mm256_cvtph_ps(_mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask))
|
||||||
|
#else
|
||||||
|
static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
|
||||||
|
float tmp[8];
|
||||||
|
|
||||||
|
for (int i = 0; i < 8; i++) {
|
||||||
|
tmp[i] = GGML_FP16_TO_FP32(x[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
return _mm256_loadu_ps(tmp);
|
||||||
|
}
|
||||||
|
static inline __m256 __avx_repeat_f32cx8_load(ggml_fp16_t *x) {
|
||||||
|
float tmp[8];
|
||||||
|
|
||||||
|
for (int i = 0; i < 4; i++) {
|
||||||
|
tmp[i] = GGML_FP16_TO_FP32(x[i]);
|
||||||
|
tmp[i + 4] = GGML_FP16_TO_FP32(x[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
return _mm256_loadu_ps(tmp);
|
||||||
|
}
|
||||||
|
static inline __m256 __avx_rearranged_f32cx8_load(ggml_fp16_t *x, __m128i arrangeMask) {
|
||||||
|
uint16_t tmphalf[8];
|
||||||
|
float tmp[8];
|
||||||
|
|
||||||
|
_mm_storeu_si128((__m128i*)tmphalf, _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask));
|
||||||
|
for (int i = 0; i < 8; i++) {
|
||||||
|
tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
return _mm256_loadu_ps(tmp);
|
||||||
|
}
|
||||||
|
|
||||||
|
#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
|
||||||
|
#define GGML_F32Cx8_REPEAT_LOAD(x, loadMask) __avx_repeat_f32cx8_load(x)
|
||||||
|
#define GGML_F32Cx8_REARRANGE_LOAD(x, arrangeMask) __avx_rearranged_f32cx8_load(x, arrangeMask)
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(__AVX2__) || defined(__AVX512F__)
|
||||||
|
static inline __m256i sum_i16_pairs_int(const __m256i x) {
|
||||||
|
const __m256i ones = _mm256_set1_epi16(1);
|
||||||
|
return _mm256_madd_epi16(ones, x);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline __m256i mul_sum_us8_pairs_int(const __m256i ax, const __m256i sy) {
|
||||||
|
#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
|
||||||
|
const __m256i zero = _mm256_setzero_si256();
|
||||||
|
return _mm256_dpbusd_epi32(zero, ax, sy);
|
||||||
|
#else
|
||||||
|
// Perform multiplication and create 16-bit values
|
||||||
|
const __m256i dot = _mm256_maddubs_epi16(ax, sy);
|
||||||
|
return sum_i16_pairs_int(dot);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// Integer variant of the function defined in ggml-quants.c
|
||||||
|
// multiply int8_t, add results pairwise twice and return as int32 vector
|
||||||
|
static inline __m256i mul_sum_i8_pairs_int(const __m256i x, const __m256i y) {
|
||||||
|
#if __AVXVNNIINT8__
|
||||||
|
const __m256i zero = _mm256_setzero_si256();
|
||||||
|
return _mm256_dpbssd_epi32(zero, x, y);
|
||||||
|
#else
|
||||||
|
// Get absolute values of x vectors
|
||||||
|
const __m256i ax = _mm256_sign_epi8(x, x);
|
||||||
|
// Sign the values of the y vectors
|
||||||
|
const __m256i sy = _mm256_sign_epi8(y, x);
|
||||||
|
return mul_sum_us8_pairs_int(ax, sy);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
|
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
|
||||||
block_q4_0x4 out;
|
block_q4_0x4 out;
|
||||||
|
|
||||||
@ -255,6 +333,103 @@ void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int64_t k)
|
|||||||
y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3);
|
y[i].qs[32 * j + 31] = vgetq_lane_s32(vi, 3);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#elif defined(__AVX2__) || defined(__AVX__)
|
||||||
|
float id[4];
|
||||||
|
__m256 srcv[4][4];
|
||||||
|
__m256 idvec[4];
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; i++) {
|
||||||
|
for (int row_iter = 0; row_iter < 4; row_iter++) {
|
||||||
|
// Load elements into 4 AVX vectors
|
||||||
|
__m256 v0 = _mm256_loadu_ps( x + row_iter * k + i * 32 );
|
||||||
|
__m256 v1 = _mm256_loadu_ps( x + row_iter * k + i * 32 + 8 );
|
||||||
|
__m256 v2 = _mm256_loadu_ps( x + row_iter * k + i * 32 + 16 );
|
||||||
|
__m256 v3 = _mm256_loadu_ps( x + row_iter * k + i * 32 + 24 );
|
||||||
|
|
||||||
|
// Compute max(abs(e)) for the block
|
||||||
|
const __m256 signBit = _mm256_set1_ps( -0.0f );
|
||||||
|
__m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
|
||||||
|
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
|
||||||
|
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
|
||||||
|
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
|
||||||
|
|
||||||
|
__m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
|
||||||
|
max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
|
||||||
|
max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
|
||||||
|
const float maxScalar = _mm_cvtss_f32( max4 );
|
||||||
|
|
||||||
|
// Divided by 127.f to mirror results in quantize_row_q8_0
|
||||||
|
const float d = maxScalar / 127.f;
|
||||||
|
id[row_iter] = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; //d ? 1.0f / d : 0.0f;
|
||||||
|
|
||||||
|
// Store the scale for the individual block
|
||||||
|
y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
|
||||||
|
|
||||||
|
// Store the values in blocks of eight values - Aim is to use these later for block interleaving
|
||||||
|
srcv[row_iter][0] = v0;
|
||||||
|
srcv[row_iter][1] = v1;
|
||||||
|
srcv[row_iter][2] = v2;
|
||||||
|
srcv[row_iter][3] = v3;
|
||||||
|
idvec[row_iter] = _mm256_set1_ps(id[row_iter]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// The loop iterates four times - The aim is to get 4 corresponding chunks of eight bytes from the original weight blocks that are interleaved
|
||||||
|
for (int j = 0; j < 4; j++) {
|
||||||
|
// Apply the multiplier
|
||||||
|
__m256 v0 = _mm256_mul_ps(srcv[0][j], idvec[0]);
|
||||||
|
__m256 v1 = _mm256_mul_ps(srcv[1][j], idvec[1]);
|
||||||
|
__m256 v2 = _mm256_mul_ps(srcv[2][j], idvec[2]);
|
||||||
|
__m256 v3 = _mm256_mul_ps(srcv[3][j], idvec[3]);
|
||||||
|
|
||||||
|
// Round to nearest integer
|
||||||
|
v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
|
||||||
|
v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
|
||||||
|
v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
|
||||||
|
v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
|
||||||
|
|
||||||
|
// Convert floats to integers
|
||||||
|
__m256i i0 = _mm256_cvtps_epi32( v0 );
|
||||||
|
__m256i i1 = _mm256_cvtps_epi32( v1 );
|
||||||
|
__m256i i2 = _mm256_cvtps_epi32( v2 );
|
||||||
|
__m256i i3 = _mm256_cvtps_epi32( v3 );
|
||||||
|
|
||||||
|
#if defined(__AVX2__)
|
||||||
|
// Convert int32 to int16
|
||||||
|
i0 = _mm256_packs_epi32( i0, i1 );
|
||||||
|
i2 = _mm256_packs_epi32( i2, i3 );
|
||||||
|
// Convert int16 to int8
|
||||||
|
i0 = _mm256_packs_epi16( i0, i2 );
|
||||||
|
|
||||||
|
// Permute and store the quantized weights in the required order after the pack instruction
|
||||||
|
const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
|
||||||
|
i0 = _mm256_permutevar8x32_epi32( i0, perm );
|
||||||
|
|
||||||
|
_mm256_storeu_si256((__m256i *)(y[i].qs + 32 * j), i0);
|
||||||
|
#else
|
||||||
|
// Since AVX lacks some of the necessary integer instructions,
|
||||||
|
// we split the registers in half and use the SSE counterparts of the AVX2 intrinsics
|
||||||
|
__m128i ni0 = _mm256_castsi256_si128( i0 );
|
||||||
|
__m128i ni1 = _mm256_extractf128_si256( i0, 1);
|
||||||
|
__m128i ni2 = _mm256_castsi256_si128( i1 );
|
||||||
|
__m128i ni3 = _mm256_extractf128_si256( i1, 1);
|
||||||
|
__m128i ni4 = _mm256_castsi256_si128( i2 );
|
||||||
|
__m128i ni5 = _mm256_extractf128_si256( i2, 1);
|
||||||
|
__m128i ni6 = _mm256_castsi256_si128( i3 );
|
||||||
|
__m128i ni7 = _mm256_extractf128_si256( i3, 1);
|
||||||
|
|
||||||
|
// Convert int32 to int16
|
||||||
|
ni0 = _mm_packs_epi32( ni0, ni1 );
|
||||||
|
ni2 = _mm_packs_epi32( ni2, ni3 );
|
||||||
|
ni4 = _mm_packs_epi32( ni4, ni5 );
|
||||||
|
ni6 = _mm_packs_epi32( ni6, ni7 );
|
||||||
|
// Convert int16 to int8
|
||||||
|
ni0 = _mm_packs_epi16( ni0, ni2 );
|
||||||
|
ni4 = _mm_packs_epi16( ni4, ni6 );
|
||||||
|
_mm_storeu_si128((__m128i *)(y[i].qs + 32 * j), ni0);
|
||||||
|
_mm_storeu_si128((__m128i *)(y[i].qs + 32 * j + 16), ni4);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
// scalar
|
// scalar
|
||||||
const int blck_size_interleave = 8;
|
const int blck_size_interleave = 8;
|
||||||
@ -337,33 +512,18 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds
|
|||||||
}
|
}
|
||||||
|
|
||||||
size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||||
if (!quant_weights) {
|
UNUSED(quant_weights);
|
||||||
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4);
|
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4);
|
||||||
}
|
|
||||||
else {
|
|
||||||
assert(false);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||||
if (!quant_weights) {
|
UNUSED(quant_weights);
|
||||||
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8);
|
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8);
|
||||||
}
|
|
||||||
else {
|
|
||||||
assert(false);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
||||||
if (!quant_weights) {
|
UNUSED(quant_weights);
|
||||||
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
|
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
|
||||||
}
|
|
||||||
else {
|
|
||||||
assert(false);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
|
void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
|
||||||
@ -699,6 +859,96 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
|||||||
GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) &&
|
GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) &&
|
||||||
"__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
|
"__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
|
||||||
"performance");
|
"performance");
|
||||||
|
#elif defined(__AVX2__)
|
||||||
|
// Lookup table to convert signed nibbles to signed bytes
|
||||||
|
__m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
|
||||||
|
signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
|
||||||
|
__m128i changemask = _mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
|
||||||
|
__m256i finalpermutemask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
|
||||||
|
|
||||||
|
// Mask to mask out nibbles from packed bytes
|
||||||
|
const __m256i m4b = _mm256_set1_epi8(0x0F);
|
||||||
|
|
||||||
|
int64_t b_nb = n / QK4_0;
|
||||||
|
|
||||||
|
const block_q4_0x8 * b_ptr_start = (const block_q4_0x8 *)vx;
|
||||||
|
const block_q8_0 * a_ptr_start = (const block_q8_0 *)vy;
|
||||||
|
|
||||||
|
// Process Q8_0 blocks one by one
|
||||||
|
for (int64_t y = 0; y < nr; y++) {
|
||||||
|
|
||||||
|
// Pointers to LHS blocks of block_q8_0 format
|
||||||
|
const block_q8_0 * a_ptr = a_ptr_start + (y * nb);
|
||||||
|
|
||||||
|
// Take group of eight block_q4_0x8 structures at each pass of the loop and perform dot product operation
|
||||||
|
for (int64_t x = 0; x < nc / 8; x++) {
|
||||||
|
|
||||||
|
// Pointers to RHS blocks
|
||||||
|
const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
|
||||||
|
|
||||||
|
// Master FP accumulator
|
||||||
|
__m256 acc_row = _mm256_setzero_ps();
|
||||||
|
|
||||||
|
for (int64_t b = 0; b < nb; b++) {
|
||||||
|
// Load 8 blocks of Q4_0 interleaved as 8 bytes (B0 - B7)
|
||||||
|
const __m256i rhs_raw_vec_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
|
||||||
|
const __m256i rhs_raw_vec_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs) + 1);
|
||||||
|
const __m256i rhs_raw_vec_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs) + 2);
|
||||||
|
const __m256i rhs_raw_vec_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs) + 3);
|
||||||
|
|
||||||
|
// 4-bit -> 8-bit - Sign is maintained
|
||||||
|
const __m256i rhs_vec_0123_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_vec_0123_0, m4b)); // B0(0-7) B1(0-7) B2(0-7) B3(0-7)
|
||||||
|
const __m256i rhs_vec_4567_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_vec_4567_0, m4b)); // B4(0-7) B5(0-7) B6(0-7) B7(0-7)
|
||||||
|
const __m256i rhs_vec_0123_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_vec_0123_1, m4b)); // B0(8-15) B1(8-15) B2(8-15) B3(8-15)
|
||||||
|
const __m256i rhs_vec_4567_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_vec_4567_1, m4b)); // B0(8-15) B1(8-15) B2(8-15) B3(8-15)
|
||||||
|
|
||||||
|
const __m256i rhs_vec_0123_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_0, 4), m4b)); // B0(16-23) B1(16-23) B2(16-23) B3(16-23)
|
||||||
|
const __m256i rhs_vec_4567_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_0, 4), m4b)); // B4(16-23) B5(16-23) B6(16-23) B7(16-23)
|
||||||
|
const __m256i rhs_vec_0123_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_0123_1, 4), m4b)); // B0(24-31) B1(24-31) B2(24-31) B3(24-31)
|
||||||
|
const __m256i rhs_vec_4567_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_vec_4567_1, 4), m4b)); // B4(24-31) B5(24-31) B6(24-31) B7(24-31)
|
||||||
|
|
||||||
|
// Load the scale values for the 8 blocks interleaved in block_q4_0x8
|
||||||
|
const __m256 col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask);
|
||||||
|
|
||||||
|
// Load and convert to FP32 scale from block_q8_0
|
||||||
|
const __m256 row_scale_f32 = _mm256_set1_ps(GGML_FP16_TO_FP32(a_ptr[b].d));
|
||||||
|
|
||||||
|
// Load the block values in block_q8_0 in batches of 16 bytes and replicate the same across 256 bit vector
|
||||||
|
__m256i lhs_vec_0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)a_ptr[b].qs));
|
||||||
|
__m256i lhs_vec_1 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(a_ptr[b].qs + 16)));
|
||||||
|
|
||||||
|
lhs_vec_0 = _mm256_permute2f128_si256(lhs_vec_0, lhs_vec_0, 0); // A0 (0-15) A0(0-15)
|
||||||
|
lhs_vec_1 = _mm256_permute2f128_si256(lhs_vec_1, lhs_vec_1, 0); // A0 (16-31) A0(16-31))
|
||||||
|
|
||||||
|
__m256i iacc = _mm256_setzero_si256();
|
||||||
|
|
||||||
|
// Dot product done within 32 bit lanes and accumulated in the same vector
|
||||||
|
// B0(0-3) B4(0-3) B1(0-3) B5(0-3) B2(0-3) B6(0-3) B3(0-3) B7(0-3) with A0(0-3)
|
||||||
|
// B0(4-7) B4(4-7) B1(4-7) B5(4-7) B2(4-7) B6(4-7) B3(4-7) B7(4-7) with A0(4-7)
|
||||||
|
// ...........................................................................
|
||||||
|
// B0(28-31) B4(28-31) B1(28-31) B5(28-31) B2(28-31) B6(28-31) B3(28-31) B7(28-31) with A0(28-31)
|
||||||
|
|
||||||
|
iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(rhs_vec_0123_0 ,_mm256_shuffle_epi32(rhs_vec_4567_0, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 0)));
|
||||||
|
iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_0, 177) ,rhs_vec_4567_0, 170), _mm256_shuffle_epi32(lhs_vec_0, 85)));
|
||||||
|
|
||||||
|
iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(rhs_vec_0123_1 ,_mm256_shuffle_epi32(rhs_vec_4567_1, 177), 170), _mm256_shuffle_epi32(lhs_vec_0, 170)));
|
||||||
|
iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_1, 177) ,rhs_vec_4567_1, 170), _mm256_shuffle_epi32(lhs_vec_0, 255)));
|
||||||
|
|
||||||
|
iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(rhs_vec_0123_2 ,_mm256_shuffle_epi32(rhs_vec_4567_2, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 0)));
|
||||||
|
iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_2, 177) ,rhs_vec_4567_2, 170), _mm256_shuffle_epi32(lhs_vec_1, 85)));
|
||||||
|
|
||||||
|
iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(rhs_vec_0123_3 ,_mm256_shuffle_epi32(rhs_vec_4567_3, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 170)));
|
||||||
|
iacc = _mm256_add_epi32(iacc, mul_sum_i8_pairs_int(_mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_3, 177) ,rhs_vec_4567_3, 170), _mm256_shuffle_epi32(lhs_vec_1, 255)));
|
||||||
|
|
||||||
|
// Accumulated values multiplied with appropriate scales
|
||||||
|
acc_row = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc), _mm256_mul_ps(col_scale_f32, row_scale_f32), acc_row);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Accumulated output values permuted so as to be stored in appropriate order post accumulation
|
||||||
|
acc_row = _mm256_permutevar8x32_ps(acc_row, finalpermutemask);
|
||||||
|
_mm256_storeu_ps(s + (y * nr + x * 8), acc_row);
|
||||||
|
}
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
float sumf[8];
|
float sumf[8];
|
||||||
int sumi;
|
int sumi;
|
||||||
@ -2158,6 +2408,353 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
|||||||
GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) &&
|
GGML_ASSERT((ggml_cpu_has_sve() || ggml_cpu_has_matmul_int8()) &&
|
||||||
"__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
|
"__ARM_FEATURE_SVE and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 quantization format for optimal "
|
||||||
"performance");
|
"performance");
|
||||||
|
#elif defined(__AVX2__) || defined(__AVX512F__)
|
||||||
|
const block_q4_0x8 * b_ptr_start = (const block_q4_0x8 *)vx;
|
||||||
|
const block_q8_0x4 * a_ptr_start = (const block_q8_0x4 *)vy;
|
||||||
|
int64_t b_nb = n / QK4_0;
|
||||||
|
int64_t y = 0;
|
||||||
|
// Mask to mask out nibbles from packed bytes
|
||||||
|
const __m256i m4b = _mm256_set1_epi8(0x0F);
|
||||||
|
const __m128i loadMask = _mm_blend_epi32(_mm_setzero_si128(), _mm_set1_epi32(0xFFFFFFFF), 3);
|
||||||
|
// Lookup table to convert signed nibbles to signed bytes
|
||||||
|
__m256i signextendlut = _mm256_castsi128_si256(_mm_set_epi8(-1, -2, -3, -4, -5, -6, -7, -8, 7, 6, 5, 4, 3, 2, 1, 0));
|
||||||
|
signextendlut = _mm256_permute2f128_si256(signextendlut, signextendlut, 0);
|
||||||
|
// Permute mask used for easier vector processing at later stages
|
||||||
|
__m256i requiredOrder = _mm256_set_epi32(3 ,2 ,1 ,0, 7 ,6, 5, 4);
|
||||||
|
|
||||||
|
// Take group of four block_q8_0x4 structures at each pass of the loop and perform dot product operation
|
||||||
|
int anr = nr - nr %16; // Used to align nr with boundary of 16
|
||||||
|
|
||||||
|
for (; y < anr / 4; y += 4) {
|
||||||
|
const block_q8_0x4 * a_ptrs[4];
|
||||||
|
|
||||||
|
a_ptrs[0] = a_ptr_start + (y * nb);
|
||||||
|
for (int i = 0; i < 3; ++i) {
|
||||||
|
a_ptrs[i + 1] = a_ptrs[i] + nb;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Take group of eight block_q4_0x8 structures at each pass of the loop and perform dot product operation
|
||||||
|
for (int64_t x = 0; x < nc / 8; x++) {
|
||||||
|
|
||||||
|
const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
|
||||||
|
|
||||||
|
// Master FP accumulators
|
||||||
|
__m256 acc_rows[16];
|
||||||
|
for (int i = 0; i < 16; i++) {
|
||||||
|
acc_rows[i] = _mm256_setzero_ps();
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int64_t b = 0; b < nb; b++) {
|
||||||
|
// Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
|
||||||
|
const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
|
||||||
|
const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32));
|
||||||
|
const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
|
||||||
|
const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
|
||||||
|
|
||||||
|
// Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
|
||||||
|
const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
|
||||||
|
const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
|
||||||
|
const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
|
||||||
|
const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
|
||||||
|
|
||||||
|
// 4-bit -> 8-bit - Sign is maintained
|
||||||
|
const __m256i rhs_mat_0145_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_0, m4b)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7)
|
||||||
|
const __m256i rhs_mat_2367_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_0, m4b)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7)
|
||||||
|
|
||||||
|
const __m256i rhs_mat_0145_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_1, m4b)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15)
|
||||||
|
const __m256i rhs_mat_2367_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_1, m4b)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15)
|
||||||
|
|
||||||
|
const __m256i rhs_mat_0145_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4b)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23)
|
||||||
|
const __m256i rhs_mat_2367_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4b)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23)
|
||||||
|
|
||||||
|
const __m256i rhs_mat_0145_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4b)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31)
|
||||||
|
const __m256i rhs_mat_2367_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4b)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31)
|
||||||
|
|
||||||
|
// Shuffle pattern one - right side input
|
||||||
|
const __m256i rhs_mat_0145_0_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3)
|
||||||
|
const __m256i rhs_mat_2367_0_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3)
|
||||||
|
|
||||||
|
const __m256i rhs_mat_0145_1_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11)
|
||||||
|
const __m256i rhs_mat_2367_1_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11)
|
||||||
|
|
||||||
|
const __m256i rhs_mat_0145_2_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19)
|
||||||
|
const __m256i rhs_mat_2367_2_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19)
|
||||||
|
|
||||||
|
const __m256i rhs_mat_0145_3_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27)
|
||||||
|
const __m256i rhs_mat_2367_3_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27)
|
||||||
|
|
||||||
|
// Shuffle pattern two - right side input
|
||||||
|
|
||||||
|
const __m256i rhs_mat_0145_0_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7)
|
||||||
|
const __m256i rhs_mat_2367_0_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7)
|
||||||
|
|
||||||
|
const __m256i rhs_mat_0145_1_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15)
|
||||||
|
const __m256i rhs_mat_2367_1_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15)
|
||||||
|
|
||||||
|
const __m256i rhs_mat_0145_2_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23)
|
||||||
|
const __m256i rhs_mat_2367_2_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23)
|
||||||
|
|
||||||
|
const __m256i rhs_mat_0145_3_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31)
|
||||||
|
const __m256i rhs_mat_2367_3_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31)
|
||||||
|
|
||||||
|
// Scale values - Load the weight scale values of block_q4_0x8
|
||||||
|
const __m256 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d);
|
||||||
|
|
||||||
|
// Process LHS in groups of four
|
||||||
|
for (int rp = 0; rp < 4; rp++) {
|
||||||
|
// Load the four block_q8_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
|
||||||
|
// Loaded as set of 128 bit vectors and repeated into a 256 bit vector
|
||||||
|
__m256i lhs_mat_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs)));
|
||||||
|
__m256i lhs_mat_01_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 0);
|
||||||
|
__m256i lhs_mat_23_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 17);
|
||||||
|
__m256i lhs_mat_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 32)));
|
||||||
|
__m256i lhs_mat_01_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 0);
|
||||||
|
__m256i lhs_mat_23_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 17);
|
||||||
|
__m256i lhs_mat_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 64)));
|
||||||
|
__m256i lhs_mat_01_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 0);
|
||||||
|
__m256i lhs_mat_23_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 17);
|
||||||
|
__m256i lhs_mat_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptrs[rp][b].qs + 96)));
|
||||||
|
__m256i lhs_mat_01_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 0);
|
||||||
|
__m256i lhs_mat_23_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 17);
|
||||||
|
|
||||||
|
// Shuffle pattern one - left side input
|
||||||
|
const __m256i lhs_mat_01_0_sp1 = _mm256_shuffle_epi32(lhs_mat_01_0, 160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
|
||||||
|
const __m256i lhs_mat_23_0_sp1 = _mm256_shuffle_epi32(lhs_mat_23_0, 160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
|
||||||
|
|
||||||
|
const __m256i lhs_mat_01_1_sp1 = _mm256_shuffle_epi32(lhs_mat_01_1, 160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
|
||||||
|
const __m256i lhs_mat_23_1_sp1 = _mm256_shuffle_epi32(lhs_mat_23_1, 160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
|
||||||
|
|
||||||
|
const __m256i lhs_mat_01_2_sp1 = _mm256_shuffle_epi32(lhs_mat_01_2, 160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
|
||||||
|
const __m256i lhs_mat_23_2_sp1 = _mm256_shuffle_epi32(lhs_mat_23_2, 160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
|
||||||
|
|
||||||
|
const __m256i lhs_mat_01_3_sp1 = _mm256_shuffle_epi32(lhs_mat_01_3, 160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
|
||||||
|
const __m256i lhs_mat_23_3_sp1 = _mm256_shuffle_epi32(lhs_mat_23_3, 160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
|
||||||
|
|
||||||
|
// Shuffle pattern two - left side input
|
||||||
|
const __m256i lhs_mat_01_0_sp2 = _mm256_shuffle_epi32(lhs_mat_01_0, 245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
|
||||||
|
const __m256i lhs_mat_23_0_sp2 = _mm256_shuffle_epi32(lhs_mat_23_0, 245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
|
||||||
|
|
||||||
|
const __m256i lhs_mat_01_1_sp2 = _mm256_shuffle_epi32(lhs_mat_01_1, 245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
|
||||||
|
const __m256i lhs_mat_23_1_sp2 = _mm256_shuffle_epi32(lhs_mat_23_1, 245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
|
||||||
|
|
||||||
|
const __m256i lhs_mat_01_2_sp2 = _mm256_shuffle_epi32(lhs_mat_01_2, 245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
|
||||||
|
const __m256i lhs_mat_23_2_sp2 = _mm256_shuffle_epi32(lhs_mat_23_2, 245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
|
||||||
|
|
||||||
|
const __m256i lhs_mat_01_3_sp2 = _mm256_shuffle_epi32(lhs_mat_01_3, 245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
|
||||||
|
const __m256i lhs_mat_23_3_sp2 = _mm256_shuffle_epi32(lhs_mat_23_3, 245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
|
||||||
|
|
||||||
|
// The values arranged in shuffle patterns are combined with a dot product operation within each 32 bit lane, i.e. corresponding bytes are multiplied and added into 32 bit integers within the 32 bit lane
|
||||||
|
// Resembles MMLAs into 2x2 matrices in ARM Version
|
||||||
|
__m256i iacc_mat_00_sp1 =
|
||||||
|
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int(lhs_mat_01_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp1, rhs_mat_0145_0_sp1));
|
||||||
|
__m256i iacc_mat_01_sp1 =
|
||||||
|
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int(lhs_mat_01_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp1, rhs_mat_2367_0_sp1));
|
||||||
|
__m256i iacc_mat_10_sp1 =
|
||||||
|
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int(lhs_mat_23_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp1, rhs_mat_0145_0_sp1));
|
||||||
|
__m256i iacc_mat_11_sp1 =
|
||||||
|
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int(lhs_mat_23_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp1, rhs_mat_2367_0_sp1));
|
||||||
|
__m256i iacc_mat_00_sp2 =
|
||||||
|
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int(lhs_mat_01_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp2, rhs_mat_0145_0_sp2));
|
||||||
|
__m256i iacc_mat_01_sp2 =
|
||||||
|
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int(lhs_mat_01_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp2, rhs_mat_2367_0_sp2));
|
||||||
|
__m256i iacc_mat_10_sp2 =
|
||||||
|
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int(lhs_mat_23_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp2, rhs_mat_0145_0_sp2));
|
||||||
|
__m256i iacc_mat_11_sp2 =
|
||||||
|
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int(lhs_mat_23_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp2, rhs_mat_2367_0_sp2));
|
||||||
|
|
||||||
|
// Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
|
||||||
|
__m256i iacc_mat_00 = _mm256_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
|
||||||
|
__m256i iacc_mat_01 = _mm256_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
|
||||||
|
__m256i iacc_mat_10 = _mm256_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
|
||||||
|
__m256i iacc_mat_11 = _mm256_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
|
||||||
|
|
||||||
|
// Straighten out to make 4 row vectors
|
||||||
|
__m256i iacc_row_0 = _mm256_blend_epi32(iacc_mat_00, _mm256_shuffle_epi32(iacc_mat_01, 78), 204);
|
||||||
|
__m256i iacc_row_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01, 204);
|
||||||
|
__m256i iacc_row_2 = _mm256_blend_epi32(iacc_mat_10, _mm256_shuffle_epi32(iacc_mat_11, 78), 204);
|
||||||
|
__m256i iacc_row_3 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11, 204);
|
||||||
|
|
||||||
|
// Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
|
||||||
|
const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);
|
||||||
|
|
||||||
|
// Multiply with appropriate scales and accumulate
|
||||||
|
acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
|
||||||
|
acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
|
||||||
|
acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
|
||||||
|
acc_rows[rp * 4 + 3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[rp * 4 + 3]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Store the accumulated values
|
||||||
|
for (int i = 0; i < 16; i++) {
|
||||||
|
_mm256_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Take one block_q8_0x4 structure at each pass of the loop and perform dot product operation
|
||||||
|
for (; y < nr / 4; y ++) {
|
||||||
|
|
||||||
|
const block_q8_0x4 * a_ptr = a_ptr_start + (y * nb);
|
||||||
|
|
||||||
|
// Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
|
||||||
|
for (int64_t x = 0; x < nc / 8; x++) {
|
||||||
|
|
||||||
|
const block_q4_0x8 * b_ptr = b_ptr_start + (x * b_nb);
|
||||||
|
|
||||||
|
// Master FP accumulators
|
||||||
|
__m256 acc_rows[4];
|
||||||
|
for (int i = 0; i < 4; i++) {
|
||||||
|
acc_rows[i] = _mm256_setzero_ps();
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int64_t b = 0; b < nb; b++) {
|
||||||
|
// Load the eight block_q4_0 quantized values interleaved with each other in chunks of eight - B0,B1 ....B6,B7
|
||||||
|
const __m256i rhs_raw_mat_0123_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs));
|
||||||
|
const __m256i rhs_raw_mat_4567_0 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 32));
|
||||||
|
const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
|
||||||
|
const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
|
||||||
|
|
||||||
|
// Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
|
||||||
|
const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
|
||||||
|
const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
|
||||||
|
const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
|
||||||
|
const __m256i rhs_raw_mat_2367_1 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_1, requiredOrder), rhs_raw_mat_4567_1, 240);
|
||||||
|
|
||||||
|
// 4-bit -> 8-bit - Sign is maintained
|
||||||
|
const __m256i rhs_mat_0145_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_0, m4b)); //B0(0-7) B1(0-7) B4(0-7) B5(0-7)
|
||||||
|
const __m256i rhs_mat_2367_0 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_0, m4b)); //B2(0-7) B3(0-7) B6(0-7) B7(0-7)
|
||||||
|
|
||||||
|
const __m256i rhs_mat_0145_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_0145_1, m4b)); //B0(8-15) B1(8-15) B4(8-15) B5(8-15)
|
||||||
|
const __m256i rhs_mat_2367_1 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(rhs_raw_mat_2367_1, m4b)); //B2(8-15) B3(8-15) B6(8-15) B7(8-15)
|
||||||
|
|
||||||
|
const __m256i rhs_mat_0145_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_0, 4), m4b)); //B0(16-23) B1(16-23) B4(16-23) B5(16-23)
|
||||||
|
const __m256i rhs_mat_2367_2 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_0, 4), m4b)); //B2(16-23) B3(16-23) B6(16-23) B7(16-23)
|
||||||
|
|
||||||
|
const __m256i rhs_mat_0145_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_0145_1, 4), m4b)); //B0(24-31) B1(24-31) B4(24-31) B5(24-31)
|
||||||
|
const __m256i rhs_mat_2367_3 = _mm256_shuffle_epi8(signextendlut, _mm256_and_si256(_mm256_srli_epi16(rhs_raw_mat_2367_1, 4), m4b)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31)
|
||||||
|
|
||||||
|
// Shuffle pattern one - right side input
|
||||||
|
const __m256i rhs_mat_0145_0_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3)
|
||||||
|
const __m256i rhs_mat_2367_0_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3)
|
||||||
|
|
||||||
|
const __m256i rhs_mat_0145_1_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11)
|
||||||
|
const __m256i rhs_mat_2367_1_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11)
|
||||||
|
|
||||||
|
const __m256i rhs_mat_0145_2_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19)
|
||||||
|
const __m256i rhs_mat_2367_2_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19)
|
||||||
|
|
||||||
|
const __m256i rhs_mat_0145_3_sp1 = _mm256_shuffle_epi32(rhs_mat_0145_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27)
|
||||||
|
const __m256i rhs_mat_2367_3_sp1 = _mm256_shuffle_epi32(rhs_mat_2367_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27)
|
||||||
|
|
||||||
|
// Shuffle pattern two - right side input
|
||||||
|
|
||||||
|
const __m256i rhs_mat_0145_0_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7)
|
||||||
|
const __m256i rhs_mat_2367_0_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7)
|
||||||
|
|
||||||
|
const __m256i rhs_mat_0145_1_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15)
|
||||||
|
const __m256i rhs_mat_2367_1_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15)
|
||||||
|
|
||||||
|
const __m256i rhs_mat_0145_2_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23)
|
||||||
|
const __m256i rhs_mat_2367_2_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23)
|
||||||
|
|
||||||
|
const __m256i rhs_mat_0145_3_sp2 = _mm256_shuffle_epi32(rhs_mat_0145_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31)
|
||||||
|
const __m256i rhs_mat_2367_3_sp2 = _mm256_shuffle_epi32(rhs_mat_2367_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31)
|
||||||
|
|
||||||
|
// Scale values - Load the weight scale values of block_q4_0x8
|
||||||
|
const __m256 col_scale_f32 = GGML_F32Cx8_LOAD(b_ptr[b].d);
|
||||||
|
|
||||||
|
// Load the four block_q8_0 quantized values interleaved with each other in chunks of eight - A0,A1,A2,A3
|
||||||
|
// Loaded as set of 128 bit vectors and repeated into a 256 bit vector
|
||||||
|
__m256i lhs_mat_0123_0 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs)));
|
||||||
|
__m256i lhs_mat_01_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 0);
|
||||||
|
__m256i lhs_mat_23_0 = _mm256_permute2f128_si256(lhs_mat_0123_0, lhs_mat_0123_0, 17);
|
||||||
|
__m256i lhs_mat_0123_1 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 32)));
|
||||||
|
__m256i lhs_mat_01_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 0);
|
||||||
|
__m256i lhs_mat_23_1 = _mm256_permute2f128_si256(lhs_mat_0123_1, lhs_mat_0123_1, 17);
|
||||||
|
__m256i lhs_mat_0123_2 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 64)));
|
||||||
|
__m256i lhs_mat_01_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 0);
|
||||||
|
__m256i lhs_mat_23_2 = _mm256_permute2f128_si256(lhs_mat_0123_2, lhs_mat_0123_2, 17);
|
||||||
|
__m256i lhs_mat_0123_3 = _mm256_loadu_si256((const __m256i *)((a_ptr[b].qs + 96)));
|
||||||
|
__m256i lhs_mat_01_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 0);
|
||||||
|
__m256i lhs_mat_23_3 = _mm256_permute2f128_si256(lhs_mat_0123_3, lhs_mat_0123_3, 17);
|
||||||
|
|
||||||
|
// Shuffle pattern one - left side input
|
||||||
|
|
||||||
|
const __m256i lhs_mat_01_0_sp1 = _mm256_shuffle_epi32(lhs_mat_01_0, 160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3)
|
||||||
|
const __m256i lhs_mat_23_0_sp1 = _mm256_shuffle_epi32(lhs_mat_23_0, 160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3)
|
||||||
|
|
||||||
|
const __m256i lhs_mat_01_1_sp1 = _mm256_shuffle_epi32(lhs_mat_01_1, 160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11)
|
||||||
|
const __m256i lhs_mat_23_1_sp1 = _mm256_shuffle_epi32(lhs_mat_23_1, 160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11)
|
||||||
|
|
||||||
|
const __m256i lhs_mat_01_2_sp1 = _mm256_shuffle_epi32(lhs_mat_01_2, 160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19)
|
||||||
|
const __m256i lhs_mat_23_2_sp1 = _mm256_shuffle_epi32(lhs_mat_23_2, 160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19)
|
||||||
|
|
||||||
|
const __m256i lhs_mat_01_3_sp1 = _mm256_shuffle_epi32(lhs_mat_01_3, 160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27)
|
||||||
|
const __m256i lhs_mat_23_3_sp1 = _mm256_shuffle_epi32(lhs_mat_23_3, 160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27)
|
||||||
|
|
||||||
|
// Shuffle pattern two - left side input
|
||||||
|
|
||||||
|
const __m256i lhs_mat_01_0_sp2 = _mm256_shuffle_epi32(lhs_mat_01_0, 245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7)
|
||||||
|
const __m256i lhs_mat_23_0_sp2 = _mm256_shuffle_epi32(lhs_mat_23_0, 245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7)
|
||||||
|
|
||||||
|
const __m256i lhs_mat_01_1_sp2 = _mm256_shuffle_epi32(lhs_mat_01_1, 245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15)
|
||||||
|
const __m256i lhs_mat_23_1_sp2 = _mm256_shuffle_epi32(lhs_mat_23_1, 245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15)
|
||||||
|
|
||||||
|
const __m256i lhs_mat_01_2_sp2 = _mm256_shuffle_epi32(lhs_mat_01_2, 245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23)
|
||||||
|
const __m256i lhs_mat_23_2_sp2 = _mm256_shuffle_epi32(lhs_mat_23_2, 245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23)
|
||||||
|
|
||||||
|
const __m256i lhs_mat_01_3_sp2 = _mm256_shuffle_epi32(lhs_mat_01_3, 245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31)
|
||||||
|
const __m256i lhs_mat_23_3_sp2 = _mm256_shuffle_epi32(lhs_mat_23_3, 245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31)
|
||||||
|
|
||||||
|
// The values arranged in shuffle patterns are combined with a dot product operation within each 32 bit lane, i.e. corresponding bytes are multiplied and added into 32 bit integers within the 32 bit lane
|
||||||
|
// Resembles MMLAs into 2x2 matrices in ARM Version
|
||||||
|
__m256i iacc_mat_00_sp1 =
|
||||||
|
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int(lhs_mat_01_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp1, rhs_mat_0145_0_sp1));
|
||||||
|
__m256i iacc_mat_01_sp1 =
|
||||||
|
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int(lhs_mat_01_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp1, rhs_mat_2367_0_sp1));
|
||||||
|
__m256i iacc_mat_10_sp1 =
|
||||||
|
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp1, rhs_mat_0145_3_sp1), mul_sum_i8_pairs_int(lhs_mat_23_2_sp1, rhs_mat_0145_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp1, rhs_mat_0145_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp1, rhs_mat_0145_0_sp1));
|
||||||
|
__m256i iacc_mat_11_sp1 =
|
||||||
|
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp1, rhs_mat_2367_3_sp1), mul_sum_i8_pairs_int(lhs_mat_23_2_sp1, rhs_mat_2367_2_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp1, rhs_mat_2367_1_sp1)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp1, rhs_mat_2367_0_sp1));
|
||||||
|
__m256i iacc_mat_00_sp2 =
|
||||||
|
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int(lhs_mat_01_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp2, rhs_mat_0145_0_sp2));
|
||||||
|
__m256i iacc_mat_01_sp2 =
|
||||||
|
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_01_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int(lhs_mat_01_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_01_0_sp2, rhs_mat_2367_0_sp2));
|
||||||
|
__m256i iacc_mat_10_sp2 =
|
||||||
|
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp2, rhs_mat_0145_3_sp2), mul_sum_i8_pairs_int(lhs_mat_23_2_sp2, rhs_mat_0145_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp2, rhs_mat_0145_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp2, rhs_mat_0145_0_sp2));
|
||||||
|
__m256i iacc_mat_11_sp2 =
|
||||||
|
_mm256_add_epi32(_mm256_add_epi32(_mm256_add_epi32(mul_sum_i8_pairs_int(lhs_mat_23_3_sp2, rhs_mat_2367_3_sp2), mul_sum_i8_pairs_int(lhs_mat_23_2_sp2, rhs_mat_2367_2_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_1_sp2, rhs_mat_2367_1_sp2)), mul_sum_i8_pairs_int(lhs_mat_23_0_sp2, rhs_mat_2367_0_sp2));
|
||||||
|
|
||||||
|
// Output of both shuffle patterns are added in order to sum dot product outputs of all 32 values in block
|
||||||
|
__m256i iacc_mat_00 = _mm256_add_epi32(iacc_mat_00_sp1, iacc_mat_00_sp2);
|
||||||
|
__m256i iacc_mat_01 = _mm256_add_epi32(iacc_mat_01_sp1, iacc_mat_01_sp2);
|
||||||
|
__m256i iacc_mat_10 = _mm256_add_epi32(iacc_mat_10_sp1, iacc_mat_10_sp2);
|
||||||
|
__m256i iacc_mat_11 = _mm256_add_epi32(iacc_mat_11_sp1, iacc_mat_11_sp2);
|
||||||
|
|
||||||
|
|
||||||
|
// Straighten out to make 4 row vectors
|
||||||
|
__m256i iacc_row_0 = _mm256_blend_epi32(iacc_mat_00, _mm256_shuffle_epi32(iacc_mat_01, 78), 204);
|
||||||
|
__m256i iacc_row_1 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01, 204);
|
||||||
|
__m256i iacc_row_2 = _mm256_blend_epi32(iacc_mat_10, _mm256_shuffle_epi32(iacc_mat_11, 78), 204);
|
||||||
|
__m256i iacc_row_3 = _mm256_blend_epi32(_mm256_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11, 204);
|
||||||
|
|
||||||
|
// Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
|
||||||
|
const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptr[b].d, loadMask);
|
||||||
|
|
||||||
|
// Multiply with appropriate scales and accumulate
|
||||||
|
acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
|
||||||
|
acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
|
||||||
|
acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
|
||||||
|
acc_rows[3] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_3), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_rows[3]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Store the accumulated values
|
||||||
|
for (int i = 0; i < 4; i++) {
|
||||||
|
_mm256_storeu_ps((float *)(s + ((y * 4 + i) * bs + x * 8)), acc_rows[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
float sumf[4][8];
|
float sumf[4][8];
|
||||||
int sumi;
|
int sumi;
|
||||||
|
@ -722,9 +722,11 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
struct ggml_backend_cpu_context {
|
struct ggml_backend_cpu_context {
|
||||||
int n_threads;
|
int n_threads;
|
||||||
void * work_data;
|
ggml_threadpool_t threadpool;
|
||||||
size_t work_size;
|
|
||||||
|
void * work_data;
|
||||||
|
size_t work_size;
|
||||||
|
|
||||||
ggml_abort_callback abort_callback;
|
ggml_abort_callback abort_callback;
|
||||||
void * abort_callback_data;
|
void * abort_callback_data;
|
||||||
@ -759,7 +761,7 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
|
|||||||
|
|
||||||
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
|
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
|
||||||
|
|
||||||
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
|
||||||
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
|
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
|
||||||
|
|
||||||
if (cpu_plan->cplan.work_size > 0) {
|
if (cpu_plan->cplan.work_size > 0) {
|
||||||
@ -796,7 +798,7 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe
|
|||||||
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||||
|
|
||||||
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
|
||||||
|
|
||||||
if (cpu_ctx->work_size < cplan.work_size) {
|
if (cpu_ctx->work_size < cplan.work_size) {
|
||||||
free(cpu_ctx->work_data);
|
free(cpu_ctx->work_data);
|
||||||
@ -825,6 +827,10 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
|
|||||||
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
|
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
|
||||||
case GGML_OP_MUL_MAT:
|
case GGML_OP_MUL_MAT:
|
||||||
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
||||||
|
case GGML_OP_ROPE_BACK:
|
||||||
|
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
|
||||||
|
case GGML_OP_IM2COL_BACK:
|
||||||
|
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
|
||||||
default:
|
default:
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -873,6 +879,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
ctx->n_threads = GGML_DEFAULT_N_THREADS;
|
ctx->n_threads = GGML_DEFAULT_N_THREADS;
|
||||||
|
ctx->threadpool = NULL;
|
||||||
ctx->work_data = NULL;
|
ctx->work_data = NULL;
|
||||||
ctx->work_size = 0;
|
ctx->work_size = 0;
|
||||||
ctx->abort_callback = NULL;
|
ctx->abort_callback = NULL;
|
||||||
@ -903,6 +910,18 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
|
|||||||
ctx->n_threads = n_threads;
|
ctx->n_threads = n_threads;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Attach `threadpool` to the CPU backend `backend_cpu`.
// Asserts that `backend_cpu` is a CPU backend. If the backend context already
// holds a non-NULL threadpool that differs from the new one, that old pool is
// paused via ggml_threadpool_pause() before the context pointer is overwritten.
// Passing the pool that is already set does not pause it.
// NOTE(review): the previous pool is only paused here, not freed - presumably
// the caller keeps ownership of it; confirm against the threadpool API.
void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
|
||||||
|
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
||||||
|
|
||||||
|
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
||||||
|
|
||||||
|
if (ctx->threadpool && ctx->threadpool != threadpool) {
|
||||||
|
// already had a different threadpool, pause/suspend it before switching
|
||||||
|
ggml_threadpool_pause(ctx->threadpool);
|
||||||
|
}
|
||||||
|
ctx->threadpool = threadpool;
|
||||||
|
}
|
||||||
|
|
||||||
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
|
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
|
||||||
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
||||||
|
|
||||||
@ -1150,6 +1169,11 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
|
||||||
|
// since the tensor is pre-allocated, it cannot be moved to another backend
|
||||||
|
GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
|
||||||
|
}
|
||||||
|
|
||||||
// graph input
|
// graph input
|
||||||
if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
|
if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
|
||||||
cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
|
cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
|
||||||
@ -1629,7 +1653,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||||||
sched->prev_leaf_backend_ids = tmp;
|
sched->prev_leaf_backend_ids = tmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
int graph_size = MAX(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
|
||||||
if (sched->graph.size < graph_size) {
|
if (sched->graph.size < graph_size) {
|
||||||
sched->graph.size = graph_size;
|
sched->graph.size = graph_size;
|
||||||
sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
|
sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
|
||||||
@ -1681,6 +1705,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||||||
for (int c = 0; c < sched->n_copies; c++) {
|
for (int c = 0; c < sched->n_copies; c++) {
|
||||||
struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
||||||
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
||||||
|
assert(graph_copy->size > graph_copy->n_leafs);
|
||||||
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1694,6 +1719,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||||||
for (int c = 0; c < sched->n_copies; c++) {
|
for (int c = 0; c < sched->n_copies; c++) {
|
||||||
struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
||||||
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
||||||
|
assert(graph_copy->size > graph_copy->n_leafs);
|
||||||
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1704,6 +1730,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
|||||||
for (int i = 0; i < graph->n_leafs; i++) {
|
for (int i = 0; i < graph->n_leafs; i++) {
|
||||||
struct ggml_tensor * leaf = graph->leafs[i];
|
struct ggml_tensor * leaf = graph->leafs[i];
|
||||||
sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
|
sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
|
||||||
|
assert(graph_copy->size > graph_copy->n_leafs);
|
||||||
graph_copy->leafs[graph_copy->n_leafs++] = leaf;
|
graph_copy->leafs[graph_copy->n_leafs++] = leaf;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -32,7 +32,7 @@ DOXYFILE_ENCODING = UTF-8
|
|||||||
# title of most generated pages and in a few other places.
|
# title of most generated pages and in a few other places.
|
||||||
# The default value is: My Project.
|
# The default value is: My Project.
|
||||||
|
|
||||||
PROJECT_NAME = "llama.cpp"
|
PROJECT_NAME = "ggml"
|
||||||
|
|
||||||
# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
|
# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
|
||||||
# could be handy for archiving the generated documentation or if some version
|
# could be handy for archiving the generated documentation or if some version
|
||||||
@ -44,7 +44,7 @@ PROJECT_NUMBER =
|
|||||||
# for a project that appears at the top of each page and should give viewer a
|
# for a project that appears at the top of each page and should give viewer a
|
||||||
# quick idea about the purpose of the project. Keep the description short.
|
# quick idea about the purpose of the project. Keep the description short.
|
||||||
|
|
||||||
PROJECT_BRIEF = "llama inference engine"
|
PROJECT_BRIEF = "Tensor library for machine learning"
|
||||||
|
|
||||||
# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
|
# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
|
||||||
# in the documentation. The maximum height of the logo should not exceed 55
|
# in the documentation. The maximum height of the logo should not exceed 55
|
||||||
|
@ -227,6 +227,25 @@ typedef struct {
|
|||||||
} block_q8_0x8;
|
} block_q8_0x8;
|
||||||
static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
|
static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
|
||||||
|
|
||||||
|
//
|
||||||
|
// Ternary quantization
|
||||||
|
//
|
||||||
|
|
||||||
|
// 1.6875 bpw
|
||||||
|
// Ternary quantization block covering QK_K elements.
// Going by the array sizes and the per-member comments, qs packs
// QK_K - 4*QK_K/64 elements at 5 per byte and qh packs the remaining
// 4*QK_K/64 elements at 4 per byte.
typedef struct {
|
||||||
|
uint8_t qs[(QK_K - 4 * QK_K / 64) / 5]; // 5 elements per byte (3^5 = 243 < 256)
|
||||||
|
uint8_t qh[QK_K/64]; // 4 elements per byte
|
||||||
|
ggml_half d; // NOTE(review): presumably the per-block scale factor -- confirm against the (de)quantization code
|
||||||
|
} block_tq1_0;
|
||||||
|
// The struct must be exactly the sum of its members, i.e. no compiler-inserted padding.
static_assert(sizeof(block_tq1_0) == sizeof(ggml_half) + QK_K / 64 + (QK_K - 4 * QK_K / 64) / 5, "wrong tq1_0 block size/padding");
|
||||||
|
|
||||||
|
// 2.0625 bpw
|
||||||
|
// Ternary quantization block covering QK_K elements, stored at 2 bits per
// element (4 elements per byte, hence QK_K/4 bytes of qs).
typedef struct {
|
||||||
|
uint8_t qs[QK_K/4]; // 2 bits per element
|
||||||
|
ggml_half d; // NOTE(review): presumably the per-block scale factor -- confirm against the (de)quantization code
|
||||||
|
} block_tq2_0;
|
||||||
|
// The struct must be exactly the sum of its members, i.e. no compiler-inserted padding.
static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0 block size/padding");
|
||||||
|
|
||||||
//
|
//
|
||||||
// Super-block quantization structures
|
// Super-block quantization structures
|
||||||
//
|
//
|
||||||
@ -361,6 +380,7 @@ typedef struct {
|
|||||||
} block_iq3_s;
|
} block_iq3_s;
|
||||||
static_assert(sizeof(block_iq3_s) == sizeof(ggml_half) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
|
static_assert(sizeof(block_iq3_s) == sizeof(ggml_half) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
|
||||||
|
|
||||||
|
// 1.5625 bpw
|
||||||
typedef struct {
|
typedef struct {
|
||||||
ggml_half d;
|
ggml_half d;
|
||||||
uint8_t qs[QK_K/8];
|
uint8_t qs[QK_K/8];
|
||||||
|
@ -9,8 +9,10 @@
|
|||||||
#include "ggml-cuda/binbcast.cuh"
|
#include "ggml-cuda/binbcast.cuh"
|
||||||
#include "ggml-cuda/clamp.cuh"
|
#include "ggml-cuda/clamp.cuh"
|
||||||
#include "ggml-cuda/concat.cuh"
|
#include "ggml-cuda/concat.cuh"
|
||||||
|
#include "ggml-cuda/conv-transpose-1d.cuh"
|
||||||
#include "ggml-cuda/convert.cuh"
|
#include "ggml-cuda/convert.cuh"
|
||||||
#include "ggml-cuda/cpy.cuh"
|
#include "ggml-cuda/cpy.cuh"
|
||||||
|
#include "ggml-cuda/cross-entropy-loss.cuh"
|
||||||
#include "ggml-cuda/diagmask.cuh"
|
#include "ggml-cuda/diagmask.cuh"
|
||||||
#include "ggml-cuda/dmmv.cuh"
|
#include "ggml-cuda/dmmv.cuh"
|
||||||
#include "ggml-cuda/fattn.cuh"
|
#include "ggml-cuda/fattn.cuh"
|
||||||
@ -25,11 +27,11 @@
|
|||||||
#include "ggml-cuda/rope.cuh"
|
#include "ggml-cuda/rope.cuh"
|
||||||
#include "ggml-cuda/scale.cuh"
|
#include "ggml-cuda/scale.cuh"
|
||||||
#include "ggml-cuda/softmax.cuh"
|
#include "ggml-cuda/softmax.cuh"
|
||||||
|
#include "ggml-cuda/sum.cuh"
|
||||||
#include "ggml-cuda/sumrows.cuh"
|
#include "ggml-cuda/sumrows.cuh"
|
||||||
#include "ggml-cuda/tsembd.cuh"
|
#include "ggml-cuda/tsembd.cuh"
|
||||||
#include "ggml-cuda/unary.cuh"
|
#include "ggml-cuda/unary.cuh"
|
||||||
#include "ggml-cuda/upscale.cuh"
|
#include "ggml-cuda/upscale.cuh"
|
||||||
#include "ggml-cuda/conv-transpose-1d.cuh"
|
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <array>
|
#include <array>
|
||||||
@ -2179,8 +2181,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
|||||||
ggml_cuda_dup(ctx, dst);
|
ggml_cuda_dup(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case GGML_OP_ADD:
|
case GGML_OP_ADD:
|
||||||
|
case GGML_OP_ADD1: // TODO: more efficient implementation
|
||||||
ggml_cuda_op_add(ctx, dst);
|
ggml_cuda_op_add(ctx, dst);
|
||||||
break;
|
break;
|
||||||
|
case GGML_OP_SUB:
|
||||||
|
ggml_cuda_op_sub(ctx, dst);
|
||||||
|
break;
|
||||||
case GGML_OP_ACC:
|
case GGML_OP_ACC:
|
||||||
ggml_cuda_op_acc(ctx, dst);
|
ggml_cuda_op_acc(ctx, dst);
|
||||||
break;
|
break;
|
||||||
@ -2192,6 +2198,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
|||||||
break;
|
break;
|
||||||
case GGML_OP_UNARY:
|
case GGML_OP_UNARY:
|
||||||
switch (ggml_get_unary_op(dst)) {
|
switch (ggml_get_unary_op(dst)) {
|
||||||
|
case GGML_UNARY_OP_NEG:
|
||||||
|
ggml_cuda_op_neg(ctx, dst);
|
||||||
|
break;
|
||||||
case GGML_UNARY_OP_GELU:
|
case GGML_UNARY_OP_GELU:
|
||||||
ggml_cuda_op_gelu(ctx, dst);
|
ggml_cuda_op_gelu(ctx, dst);
|
||||||
break;
|
break;
|
||||||
@ -2267,6 +2276,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
|||||||
case GGML_OP_SQRT:
|
case GGML_OP_SQRT:
|
||||||
ggml_cuda_op_sqrt(ctx, dst);
|
ggml_cuda_op_sqrt(ctx, dst);
|
||||||
break;
|
break;
|
||||||
|
case GGML_OP_SIN:
|
||||||
|
ggml_cuda_op_sin(ctx, dst);
|
||||||
|
break;
|
||||||
|
case GGML_OP_COS:
|
||||||
|
ggml_cuda_op_cos(ctx, dst);
|
||||||
|
break;
|
||||||
case GGML_OP_CLAMP:
|
case GGML_OP_CLAMP:
|
||||||
ggml_cuda_op_clamp(ctx, dst);
|
ggml_cuda_op_clamp(ctx, dst);
|
||||||
break;
|
break;
|
||||||
@ -2294,6 +2309,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
|||||||
case GGML_OP_POOL_2D:
|
case GGML_OP_POOL_2D:
|
||||||
ggml_cuda_op_pool2d(ctx, dst);
|
ggml_cuda_op_pool2d(ctx, dst);
|
||||||
break;
|
break;
|
||||||
|
case GGML_OP_SUM:
|
||||||
|
ggml_cuda_op_sum(ctx, dst);
|
||||||
|
break;
|
||||||
case GGML_OP_SUM_ROWS:
|
case GGML_OP_SUM_ROWS:
|
||||||
ggml_cuda_op_sum_rows(ctx, dst);
|
ggml_cuda_op_sum_rows(ctx, dst);
|
||||||
break;
|
break;
|
||||||
@ -2303,6 +2321,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
|||||||
case GGML_OP_FLASH_ATTN_EXT:
|
case GGML_OP_FLASH_ATTN_EXT:
|
||||||
ggml_cuda_flash_attn_ext(ctx, dst);
|
ggml_cuda_flash_attn_ext(ctx, dst);
|
||||||
break;
|
break;
|
||||||
|
case GGML_OP_CROSS_ENTROPY_LOSS:
|
||||||
|
ggml_cuda_cross_entropy_loss(ctx, dst);
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -2559,8 +2580,15 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
|||||||
cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
|
cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
|
||||||
// store a pointer to each copy op CUDA kernel to identify it later
|
// store a pointer to each copy op CUDA kernel to identify it later
|
||||||
void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
|
void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
|
||||||
if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
|
if (!ptr) {
|
||||||
ggml_cuda_cpy_fn_ptrs.push_back(ptr);
|
use_cuda_graph = false;
|
||||||
|
#ifndef NDEBUG
|
||||||
|
GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
|
||||||
|
#endif
|
||||||
|
} else {
|
||||||
|
if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
|
||||||
|
ggml_cuda_cpy_fn_ptrs.push_back(ptr);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2610,6 +2638,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
|||||||
assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
|
assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
|
||||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
if (node->src[j] != nullptr) {
|
if (node->src[j] != nullptr) {
|
||||||
|
assert(node->src[j]->buffer);
|
||||||
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
|
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2727,6 +2756,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|||||||
switch (op->op) {
|
switch (op->op) {
|
||||||
case GGML_OP_UNARY:
|
case GGML_OP_UNARY:
|
||||||
switch (ggml_get_unary_op(op)) {
|
switch (ggml_get_unary_op(op)) {
|
||||||
|
case GGML_UNARY_OP_NEG:
|
||||||
case GGML_UNARY_OP_GELU:
|
case GGML_UNARY_OP_GELU:
|
||||||
case GGML_UNARY_OP_SILU:
|
case GGML_UNARY_OP_SILU:
|
||||||
case GGML_UNARY_OP_RELU:
|
case GGML_UNARY_OP_RELU:
|
||||||
@ -2828,6 +2858,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|||||||
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
|
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
if (src0_type == src1_type && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_DUP:
|
case GGML_OP_DUP:
|
||||||
@ -2853,21 +2886,29 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|||||||
case GGML_OP_TRANSPOSE:
|
case GGML_OP_TRANSPOSE:
|
||||||
case GGML_OP_NORM:
|
case GGML_OP_NORM:
|
||||||
case GGML_OP_ADD:
|
case GGML_OP_ADD:
|
||||||
|
case GGML_OP_ADD1:
|
||||||
|
case GGML_OP_SUB:
|
||||||
case GGML_OP_MUL:
|
case GGML_OP_MUL:
|
||||||
case GGML_OP_DIV:
|
case GGML_OP_DIV:
|
||||||
case GGML_OP_RMS_NORM:
|
case GGML_OP_RMS_NORM:
|
||||||
case GGML_OP_SCALE:
|
case GGML_OP_SCALE:
|
||||||
case GGML_OP_SQR:
|
case GGML_OP_SQR:
|
||||||
case GGML_OP_SQRT:
|
case GGML_OP_SQRT:
|
||||||
|
case GGML_OP_SIN:
|
||||||
|
case GGML_OP_COS:
|
||||||
case GGML_OP_CLAMP:
|
case GGML_OP_CLAMP:
|
||||||
|
return true;
|
||||||
case GGML_OP_CONT:
|
case GGML_OP_CONT:
|
||||||
|
return op->src[0]->type != GGML_TYPE_BF16;
|
||||||
case GGML_OP_DIAG_MASK_INF:
|
case GGML_OP_DIAG_MASK_INF:
|
||||||
case GGML_OP_SOFT_MAX:
|
case GGML_OP_SOFT_MAX:
|
||||||
return true;
|
return true;
|
||||||
case GGML_OP_ROPE:
|
case GGML_OP_ROPE:
|
||||||
return ggml_is_contiguous(op->src[0]);
|
return ggml_is_contiguous(op->src[0]);
|
||||||
case GGML_OP_IM2COL:
|
case GGML_OP_IM2COL:
|
||||||
|
return op->src[0]->type == GGML_TYPE_F16;
|
||||||
case GGML_OP_POOL_2D:
|
case GGML_OP_POOL_2D:
|
||||||
|
case GGML_OP_SUM:
|
||||||
case GGML_OP_SUM_ROWS:
|
case GGML_OP_SUM_ROWS:
|
||||||
case GGML_OP_ARGSORT:
|
case GGML_OP_ARGSORT:
|
||||||
case GGML_OP_ACC:
|
case GGML_OP_ACC:
|
||||||
@ -2890,6 +2931,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|||||||
}
|
}
|
||||||
return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA &&
|
return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA &&
|
||||||
op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
|
op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
|
||||||
|
case GGML_OP_CROSS_ENTROPY_LOSS:
|
||||||
|
return true;
|
||||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
|
@ -9,6 +9,10 @@ static __device__ __forceinline__ float op_add(const float a, const float b) {
|
|||||||
return a + b;
|
return a + b;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Binary device functor: returns the difference a - b.
static __device__ __forceinline__ float op_sub(const float a, const float b) {
    const float difference = a - b;
    return difference;
}
|
||||||
|
|
||||||
static __device__ __forceinline__ float op_mul(const float a, const float b) {
|
static __device__ __forceinline__ float op_mul(const float a, const float b) {
|
||||||
return a * b;
|
return a * b;
|
||||||
}
|
}
|
||||||
@ -271,6 +275,10 @@ void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|||||||
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
|
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Subtraction through the generic binary broadcast path:
// dst = dst->src[0] - dst->src[1], launched on the stream of `ctx`.
void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_tensor * minuend    = dst->src[0];
    ggml_tensor * subtrahend = dst->src[1];

    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_sub>>(
        minuend, subtrahend, dst,
        minuend->data, subtrahend->data, dst->data,
        ctx.stream());
}
|
||||||
|
|
||||||
void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
|
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
|
||||||
}
|
}
|
||||||
|
@ -2,5 +2,6 @@
|
|||||||
|
|
||||||
void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||||
void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||||
|
void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||||
void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||||
void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||||
|
@ -428,7 +428,10 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
|||||||
char * src0_ddc = (char *) src0->data;
|
char * src0_ddc = (char *) src0->data;
|
||||||
char * src1_ddc = (char *) src1->data;
|
char * src1_ddc = (char *) src1->data;
|
||||||
|
|
||||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
|
||||||
|
GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
|
||||||
|
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
|
||||||
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||||
ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
||||||
ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||||
@ -449,9 +452,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
|||||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||||
ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
|
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
|
||||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||||
GGML_ABORT("fatal error");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -461,29 +463,30 @@ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
|
void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
|
||||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
|
||||||
return (void*) cpy_f32_f16<cpy_1_f32_f32>;
|
return nullptr;
|
||||||
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||||
|
return (void*) cpy_f32_f16<cpy_1_f32_f32>;
|
||||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
||||||
return (void*) cpy_f32_f16<cpy_1_f32_f16>;
|
return (void*) cpy_f32_f16<cpy_1_f32_f16>;
|
||||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
||||||
return (void*) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
|
return (void*) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
|
||||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
|
||||||
return (void*) cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>;
|
return (void*) cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>;
|
||||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
|
||||||
return (void*) cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>;
|
return (void*) cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>;
|
||||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
|
||||||
return (void*) cpy_f32_q<cpy_blck_f32_q5_0, QK5_0>;
|
return (void*) cpy_f32_q<cpy_blck_f32_q5_0, QK5_0>;
|
||||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
|
||||||
return (void*) cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>;
|
return (void*) cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>;
|
||||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
|
||||||
return (void*) cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>;
|
return (void*) cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>;
|
||||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
|
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
|
||||||
return (void*) cpy_f32_f16<cpy_1_f32_f16>;
|
return (void*) cpy_f32_f16<cpy_1_f32_f16>;
|
||||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||||
return (void*) cpy_f32_f16<cpy_1_f16_f32>;
|
return (void*) cpy_f32_f16<cpy_1_f16_f32>;
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
|
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
|
||||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||||
GGML_ABORT("fatal error");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
106
ggml/src/ggml-cuda/cross-entropy-loss.cu
Normal file
106
ggml/src/ggml-cuda/cross-entropy-loss.cu
Normal file
@ -0,0 +1,106 @@
|
|||||||
|
#include "common.cuh"
|
||||||
|
#include "cross-entropy-loss.cuh"
|
||||||
|
#include "sum.cuh"
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
|
#include <cstdint>
|
||||||
|
|
||||||
|
// Computes one partial cross entropy loss value per CUDA block.
//
// logits, labels: read as k contiguous rows of nclasses floats each.
// dst:            one float per block; dst[blockIdx.x] receives the sum over the
//                 rows handled by this block of -sum_i(labels[i]*log_softmax(logits)[i]) / k.
// nclasses:       number of values per row.
// k:              total number of rows.
//
// Each thread handles one row; each warp stages WARP_SIZE rows in dynamic shared memory.
// The launch must provide 2*blockDim.x*nclasses floats of dynamic shared memory, and the
// final reduction assumes blockDim.x == CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE.
static __global__ void cross_entropy_loss_f32(const float * logits, const float * labels, float * dst, const int nclasses, const int k) {
    const int warp_id = threadIdx.x / WARP_SIZE;
    const int lane_id = threadIdx.x % WARP_SIZE;
    const int i0 = blockDim.x*blockIdx.x + warp_id*WARP_SIZE; // index of the first row handled by this warp

    const int ne_tmp = WARP_SIZE*nclasses; // number of floats staged per warp and per input

    extern __shared__ float tmp_all[];
    float * tmp_logits = tmp_all + (2*warp_id + 0)*ne_tmp;
    float * tmp_labels = tmp_all + (2*warp_id + 1)*ne_tmp;

    // Each warp first loads ne_tmp logits/labels into shared memory.
    // Rows at or beyond k are filled with zeros; their labels are 0 so they add nothing to the loss.
    for (int i = lane_id; i < ne_tmp; i += WARP_SIZE) {
        const int ig = i0*nclasses + i; // ig == i global

        tmp_logits[i] = ig < k*nclasses ? logits[ig] : 0.0f;
        tmp_labels[i] = ig < k*nclasses ? labels[ig] : 0.0f;
    }

    // The loop above writes with a lane-strided pattern while the code below reads whole rows,
    // so every thread reads values that were written by other threads of its warp.
    // A barrier is needed to make those writes visible before they are read.
    // All threads reach this point: the loop trip count is uniform and nothing returns earlier.
    __syncthreads();

    // Each thread in the warp then calculates the cross entropy loss for a single row.
    // TODO: pad in order to avoid shared memory bank conflicts.

    // Find maximum for softmax:
    float max = -INFINITY;
    for (int i = 0; i < nclasses; ++i) {
        max = fmaxf(max, tmp_logits[lane_id*nclasses + i]);
    }

    // Shift the logits by the maximum (numerical stability) and accumulate the softmax denominator:
    float sum = 0.0f;
    for (int i = 0; i < nclasses; ++i) {
        float val = tmp_logits[lane_id*nclasses + i] - max;
        sum += expf(val);
        tmp_logits[lane_id*nclasses + i] = val;
    }
    sum = logf(sum);

    // log(exp(logits - max) / sum) = (logits - max) - log(sum)
    float loss = 0.0f;
    for (int i = 0; i < nclasses; ++i) {
        loss += (tmp_logits[lane_id*nclasses + i] - sum) * tmp_labels[lane_id*nclasses + i];
    }
    loss = -warp_reduce_sum(loss) / (float)k;

    // tmp_all[warp_id] below aliases the staging area of warp 0,
    // so wait until every warp is done reading its staged rows.
    __syncthreads();

    if (lane_id == 0) {
        tmp_all[warp_id] = loss;
    }

    __syncthreads();

    // Only the first warp combines the per-warp results.
    if (warp_id != 0) {
        return;
    }

    loss = lane_id < CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE/WARP_SIZE ? tmp_all[lane_id] : 0.0f;
    loss = warp_reduce_sum(loss);

    if (lane_id != 0) {
        return;
    }

    dst[blockIdx.x] = loss;
}
|
||||||
|
|
||||||
|
// Host-side launcher for the cross entropy loss.
// dst->src[0] and dst->src[1] are passed to the kernel as logits and labels; both inputs and
// dst must be contiguous F32 tensors. The kernel writes one partial result per CUDA block
// into a temporary pool buffer, which is then summed into dst.
void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

    GGML_ASSERT(ggml_is_contiguous(src0));
    GGML_ASSERT(ggml_is_contiguous(src1));
    GGML_ASSERT(ggml_is_contiguous(dst));

    const int64_t nclasses = src0->ne[0];
    const int64_t nrows    = ggml_nrows(src0);

    const float * logits_d = (const float *) src0->data;
    const float * labels_d = (const float *) src1->data;
    float       * out_d    = (float       *) dst->data;

    ggml_cuda_pool & pool = ctx.pool();
    cudaStream_t stream = ctx.stream();

    // one thread per row, rounded up to whole blocks
    const int64_t nblocks = (nrows + CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE - 1) / CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE;

    const dim3 blocks_dim(CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE, 1, 1);
    const dim3 blocks_num(nblocks, 1, 1);

    // every thread stages one row of logits and one row of labels in shared memory
    const int shmem = 2*CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE*nclasses*sizeof(float);

    // one partial loss value per block
    ggml_cuda_pool_alloc<float> dst_tmp(pool, blocks_num.x);

    cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(logits_d, labels_d, dst_tmp.ptr, nclasses, nrows);

    // Combine results from individual blocks:
    sum_f32_cuda(pool, dst_tmp.ptr, out_d, blocks_num.x, stream);
}
|
5
ggml/src/ggml-cuda/cross-entropy-loss.cuh
Normal file
5
ggml/src/ggml-cuda/cross-entropy-loss.cuh
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
#include "common.cuh"
|
||||||
|
|
||||||
|
#define CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE 256
|
||||||
|
|
||||||
|
void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
@ -22,6 +22,7 @@ typedef void (* fattn_kernel_t)(
|
|||||||
const float m0,
|
const float m0,
|
||||||
const float m1,
|
const float m1,
|
||||||
const uint32_t n_head_log2,
|
const uint32_t n_head_log2,
|
||||||
|
const float logit_softcap,
|
||||||
const int ne00,
|
const int ne00,
|
||||||
const int ne01,
|
const int ne01,
|
||||||
const int ne02,
|
const int ne02,
|
||||||
@ -657,11 +658,17 @@ void launch_fattn(
|
|||||||
const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
|
const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
|
||||||
const int shmem = 0;
|
const int shmem = 0;
|
||||||
|
|
||||||
float scale = 1.0f;
|
float scale = 1.0f;
|
||||||
float max_bias = 0.0f;
|
float max_bias = 0.0f;
|
||||||
|
float logit_softcap = 0.0f;
|
||||||
|
|
||||||
memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float));
|
memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float));
|
||||||
memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
|
memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
|
||||||
|
memcpy(&logit_softcap, (float *) KQV->op_params + 2, sizeof(float));
|
||||||
|
|
||||||
|
if (logit_softcap != 0.0f) {
|
||||||
|
scale /= logit_softcap;
|
||||||
|
}
|
||||||
|
|
||||||
const uint32_t n_head = Q->ne[2];
|
const uint32_t n_head = Q->ne[2];
|
||||||
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
|
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
|
||||||
@ -675,7 +682,7 @@ void launch_fattn(
|
|||||||
V_data,
|
V_data,
|
||||||
mask ? ((const char *) mask->data) : nullptr,
|
mask ? ((const char *) mask->data) : nullptr,
|
||||||
(parallel_blocks) == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
|
(parallel_blocks) == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
|
||||||
scale, max_bias, m0, m1, n_head_log2,
|
scale, max_bias, m0, m1, n_head_log2, logit_softcap,
|
||||||
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
|
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
|
||||||
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
|
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
|
||||||
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
|
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
|
||||||
|
@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
#define FATTN_KQ_STRIDE_TILE_F16 64
|
#define FATTN_KQ_STRIDE_TILE_F16 64
|
||||||
|
|
||||||
template<int D, int ncols, int nwarps, int parallel_blocks> // D == head size
|
template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
|
||||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||||
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||||
@ -20,6 +20,7 @@ static __global__ void flash_attn_tile_ext_f16(
|
|||||||
const float m0,
|
const float m0,
|
||||||
const float m1,
|
const float m1,
|
||||||
const uint32_t n_head_log2,
|
const uint32_t n_head_log2,
|
||||||
|
const float logit_softcap,
|
||||||
const int ne00,
|
const int ne00,
|
||||||
const int ne01,
|
const int ne01,
|
||||||
const int ne02,
|
const int ne02,
|
||||||
@ -44,6 +45,12 @@ static __global__ void flash_attn_tile_ext_f16(
|
|||||||
const int ne2,
|
const int ne2,
|
||||||
const int ne3) {
|
const int ne3) {
|
||||||
#ifdef FP16_AVAILABLE
|
#ifdef FP16_AVAILABLE
|
||||||
|
// Skip unused kernel variants for faster compilation:
|
||||||
|
if (use_logit_softcap && !(D == 128 || D == 256)) {
|
||||||
|
NO_DEVICE_CODE;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||||
|
|
||||||
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
|
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
|
||||||
@ -154,7 +161,13 @@ static __global__ void flash_attn_tile_ext_f16(
|
|||||||
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
|
||||||
const int j_KQ = j_KQ_0 + threadIdx.y;
|
const int j_KQ = j_KQ_0 + threadIdx.y;
|
||||||
|
|
||||||
half sum = __low2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]) + __high2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
|
half sum;
|
||||||
|
if (use_logit_softcap) {
|
||||||
|
const float2 tmp = __half22float2(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
|
||||||
|
sum = logit_softcap * tanhf(tmp.x + tmp.y);
|
||||||
|
} else {
|
||||||
|
sum = __low2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]) + __high2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
|
||||||
|
}
|
||||||
sum += mask ? slopeh*maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
|
sum += mask ? slopeh*maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
|
||||||
|
|
||||||
kqmax_new[j_KQ_0/nwarps] = ggml_cuda_hmax(kqmax_new[j_KQ_0/nwarps], sum);
|
kqmax_new[j_KQ_0/nwarps] = ggml_cuda_hmax(kqmax_new[j_KQ_0/nwarps], sum);
|
||||||
@ -270,20 +283,20 @@ static __global__ void flash_attn_tile_ext_f16(
|
|||||||
#endif // FP16_AVAILABLE
|
#endif // FP16_AVAILABLE
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int cols_per_block, int parallel_blocks>
|
template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
|
||||||
void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
const ggml_tensor * Q = dst->src[0];
|
const ggml_tensor * Q = dst->src[0];
|
||||||
switch (Q->ne[0]) {
|
switch (Q->ne[0]) {
|
||||||
case 64: {
|
case 64: {
|
||||||
constexpr int D = 64;
|
constexpr int D = 64;
|
||||||
constexpr int nwarps = 8;
|
constexpr int nwarps = 8;
|
||||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks>;
|
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
|
||||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
|
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
|
||||||
} break;
|
} break;
|
||||||
case 128: {
|
case 128: {
|
||||||
constexpr int D = 128;
|
constexpr int D = 128;
|
||||||
constexpr int nwarps = 8;
|
constexpr int nwarps = 8;
|
||||||
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks>;
|
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
|
||||||
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
|
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
|
||||||
} break;
|
} break;
|
||||||
default: {
|
default: {
|
||||||
@ -296,24 +309,45 @@ void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_ten
|
|||||||
const ggml_tensor * KQV = dst;
|
const ggml_tensor * KQV = dst;
|
||||||
const ggml_tensor * Q = dst->src[0];
|
const ggml_tensor * Q = dst->src[0];
|
||||||
|
|
||||||
const int32_t precision = KQV->op_params[2];
|
const int32_t precision = KQV->op_params[3];
|
||||||
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
|
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
|
||||||
|
|
||||||
|
float logit_softcap;
|
||||||
|
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
|
||||||
|
|
||||||
if (Q->ne[1] <= 16) {
|
if (Q->ne[1] <= 16) {
|
||||||
constexpr int cols_per_block = 16;
|
constexpr int cols_per_block = 16;
|
||||||
constexpr int parallel_blocks = 4;
|
constexpr int parallel_blocks = 4;
|
||||||
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
if (logit_softcap == 0.0f) {
|
||||||
|
constexpr bool use_logit_softcap = false;
|
||||||
|
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
|
||||||
|
} else {
|
||||||
|
constexpr bool use_logit_softcap = true;
|
||||||
|
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
|
||||||
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (Q->ne[1] <= 32) {
|
if (Q->ne[1] <= 32) {
|
||||||
constexpr int cols_per_block = 32;
|
constexpr int cols_per_block = 32;
|
||||||
constexpr int parallel_blocks = 4;
|
constexpr int parallel_blocks = 4;
|
||||||
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
if (logit_softcap == 0.0f) {
|
||||||
|
constexpr bool use_logit_softcap = false;
|
||||||
|
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
|
||||||
|
} else {
|
||||||
|
constexpr bool use_logit_softcap = true;
|
||||||
|
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
|
||||||
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
constexpr int cols_per_block = 32;
|
constexpr int cols_per_block = 32;
|
||||||
constexpr int parallel_blocks = 1;
|
constexpr int parallel_blocks = 1;
|
||||||
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
|
if (logit_softcap == 0.0f) {
|
||||||
|
constexpr bool use_logit_softcap = false;
|
||||||
|
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
|
||||||
|
} else {
|
||||||
|
constexpr bool use_logit_softcap = true;
|
||||||
|
launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user