Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-11-15 07:19:53 +00:00)

Merge branch 'master' into compilade/mamba2

This commit is contained in: 0e601cafe9
@@ -1,18 +1,16 @@
 ARG UBUNTU_VERSION=22.04

 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
+ARG CUDA_VERSION=12.6.0

 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

 FROM ${BASE_CUDA_DEV_CONTAINER} AS build

-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default

 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
+    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1

 COPY requirements.txt requirements.txt
 COPY requirements requirements
@@ -24,13 +22,12 @@ WORKDIR /app

 COPY . .

-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable CUDA
-ENV GGML_CUDA=1
-# Enable cURL
-ENV LLAMA_CURL=1
-
-RUN make -j$(nproc)
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release -j$(nproc) && \
+    cp build/bin/* .

 ENTRYPOINT ["/app/.devops/tools.sh"]
@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
+ARG CUDA_VERSION=12.6.0
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 # Target the CUDA runtime image
@@ -8,28 +8,30 @@ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_V

 FROM ${BASE_CUDA_DEV_CONTAINER} AS build

-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default

 RUN apt-get update && \
-    apt-get install -y build-essential git
+    apt-get install -y build-essential git cmake

 WORKDIR /app

 COPY . .

-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable CUDA
-ENV GGML_CUDA=1
-
-RUN make -j$(nproc) llama-cli
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-cli -j$(nproc)

 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

 RUN apt-get update && \
     apt-get install -y libgomp1

-COPY --from=build /app/llama-cli /llama-cli
+COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
+COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/bin/llama-cli /llama-cli

 ENTRYPOINT [ "/llama-cli" ]
@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
+ARG CUDA_VERSION=12.6.0
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 # Target the CUDA runtime image
@@ -8,31 +8,34 @@ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_V

 FROM ${BASE_CUDA_DEV_CONTAINER} AS build

-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default

 RUN apt-get update && \
-    apt-get install -y build-essential git libcurl4-openssl-dev
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev

 WORKDIR /app

 COPY . .

-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable CUDA
-ENV GGML_CUDA=1
-# Enable cURL
-ENV LLAMA_CURL=1
-
-RUN make -j$(nproc) llama-server
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-server -j$(nproc)

 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev libgomp1 curl

-COPY --from=build /app/llama-server /llama-server
+COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
+COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/bin/llama-server /llama-server
+
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
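A minimal way to exercise the new CMake-based CUDA images is sketched below. The image tag and model path are placeholders, and CUDA_DOCKER_ARCH only needs to be overridden when building for a single architecture instead of the default fat build; the Dockerfile path matches the one referenced by the docker workflow further down.

  # build the server image, optionally pinning one CUDA architecture (e.g. 86)
  docker build -t local/llama.cpp:server-cuda \
    --build-arg CUDA_DOCKER_ARCH=86 \
    -f .devops/llama-server-cuda.Dockerfile .

  # run it with GPU access; LLAMA_ARG_HOST=0.0.0.0 in the image lets the host reach port 8080
  docker run --gpus all -p 8080:8080 -v /path/to/models:/models \
    local/llama.cpp:server-cuda -m /models/model.gguf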
@@ -26,6 +26,8 @@ RUN apt-get update && \
 COPY --from=build /app/build/bin/llama-server /llama-server

 ENV LC_ALL=C.utf8
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
@@ -39,6 +39,8 @@ ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0

 # Enable cURL
 ENV LLAMA_CURL=1
@@ -23,6 +23,8 @@ RUN cp /app/build/bin/llama-server /llama-server && \
     rm -rf /app

 ENV LC_ALL=C.utf8
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
@@ -21,6 +21,8 @@ RUN apt-get update && \
 COPY --from=build /app/llama-server /llama-server

 ENV LC_ALL=C.utf8
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
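Since every server image now exports LLAMA_ARG_HOST=0.0.0.0 and declares the same health check, a quick sanity check from the host could look like the sketch below; the container name and the 8080:8080 port mapping are assumptions, not something defined in these Dockerfiles.

  # the HEALTHCHECK above probes this endpoint inside the container;
  # with -p 8080:8080 the same request works from the host
  curl -f http://localhost:8080/health

  # docker exposes the health status derived from that check
  docker inspect --format '{{.State.Health.Status}}' llama-server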
@@ -1,13 +1,52 @@
+{ inputs, ... }:
+
 {
   perSystem =
-    { config, lib, ... }:
+    {
+      config,
+      lib,
+      system,
+      ...
+    }:
     {
       devShells =
-        lib.concatMapAttrs
-          (name: package: {
-            ${name} = package.passthru.shell;
-            ${name + "-extra"} = package.passthru.shell-extra;
-          })
-          config.packages;
+        let
+          pkgs = import inputs.nixpkgs { inherit system; };
+          stdenv = pkgs.stdenv;
+          scripts = config.packages.python-scripts;
+        in
+        lib.pipe (config.packages) [
+          (lib.concatMapAttrs (
+            name: package: {
+              ${name} = pkgs.mkShell {
+                name = "${name}";
+                inputsFrom = [ package ];
+                shellHook = ''
+                  echo "Entering ${name} devShell"
+                '';
+              };
+              "${name}-extra" =
+                if (name == "python-scripts") then
+                  null
+                else
+                  pkgs.mkShell {
+                    name = "${name}-extra";
+                    inputsFrom = [
+                      package
+                      scripts
+                    ];
+                    # Extra packages that *may* be used by some scripts
+                    packages = [
+                      pkgs.python3Packages.tiktoken
+                    ];
+                    shellHook = ''
+                      echo "Entering ${name} devShell"
+                      addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib stdenv.cc.cc}/lib"
+                    '';
+                  };
+            }
+          ))
+          (lib.filterAttrs (name: value: value != null))
+        ];
     };
 }
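With the dev shells now generated from config.packages via mkShell, entering one is a plain `nix develop` call. The attribute names below are assumptions that depend on how the flake exposes its packages; they follow the `${name}` / `${name}-extra` scheme defined above.

  # basic shell built from a package's own inputs (assumed attribute name)
  nix develop .#llama-cpp

  # the "-extra" variant additionally pulls in the python-scripts package
  # plus tiktoken, per the packages/shellHook defined above
  nix develop .#llama-cpp-extra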
@@ -26,16 +26,14 @@
       config.cudaSupport = true;
       config.allowUnfreePredicate =
         p:
-        builtins.all
-          (
-            license:
-            license.free
-            || builtins.elem license.shortName [
-              "CUDA EULA"
-              "cuDNN EULA"
-            ]
-          )
-          (p.meta.licenses or [ p.meta.license ]);
+        builtins.all (
+          license:
+          license.free
+          || builtins.elem license.shortName [
+            "CUDA EULA"
+            "cuDNN EULA"
+          ]
+        ) (p.meta.licenses or [ p.meta.license ]);
     };
     # Ensure dependencies use ROCm consistently
     pkgsRocm = import inputs.nixpkgs {
.devops/nix/package-gguf-py.nix (new file, 36 lines)
@@ -0,0 +1,36 @@
+{
+  lib,
+  llamaVersion,
+  numpy,
+  tqdm,
+  sentencepiece,
+  pyyaml,
+  poetry-core,
+  buildPythonPackage,
+  pytestCheckHook,
+}:
+
+buildPythonPackage {
+  pname = "gguf";
+  version = llamaVersion;
+  pyproject = true;
+  nativeBuildInputs = [ poetry-core ];
+  propagatedBuildInputs = [
+    numpy
+    tqdm
+    sentencepiece
+    pyyaml
+  ];
+  src = lib.cleanSource ../../gguf-py;
+  pythonImportsCheck = [
+    "numpy"
+    "gguf"
+  ];
+  nativeCheckInputs = [ pytestCheckHook ];
+  doCheck = true;
+  meta = with lib; {
+    description = "Python package for writing binary files in the GGUF format";
+    license = licenses.mit;
+    maintainers = [ maintainers.ditsuke ];
+  };
+}
@@ -3,31 +3,33 @@
   glibc,
   config,
   stdenv,
-  mkShell,
   runCommand,
   cmake,
   ninja,
   pkg-config,
   git,
-  python3,
   mpi,
   blas,
   cudaPackages,
+  autoAddDriverRunpath,
   darwin,
   rocmPackages,
   vulkan-headers,
   vulkan-loader,
   curl,
   shaderc,
-  useBlas ? builtins.all (x: !x) [
-    useCuda
-    useMetalKit
-    useRocm
-    useVulkan
-  ] && blas.meta.available,
+  useBlas ?
+    builtins.all (x: !x) [
+      useCuda
+      useMetalKit
+      useRocm
+      useVulkan
+    ]
+    && blas.meta.available,
   useCuda ? config.cudaSupport,
   useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
-  useMpi ? false, # Increases the runtime closure size by ~700M
+  # Increases the runtime closure size by ~700M
+  useMpi ? false,
   useRocm ? config.rocmSupport,
   enableCurl ? true,
   useVulkan ? false,
@@ -37,8 +39,8 @@
   # otherwise we get libstdc++ errors downstream.
   effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
   enableStatic ? effectiveStdenv.hostPlatform.isStatic,
-  precompileMetalShaders ? false
-}@inputs:
+  precompileMetalShaders ? false,
+}:

 let
   inherit (lib)
@@ -46,7 +48,6 @@ let
     cmakeFeature
     optionals
     strings
-    versionOlder
     ;

   stdenv = throw "Use effectiveStdenv instead";
@@ -62,54 +63,11 @@ let
   pnameSuffix =
     strings.optionalString (suffices != [ ])
       "-${strings.concatMapStringsSep "-" strings.toLower suffices}";
-  descriptionSuffix =
-    strings.optionalString (suffices != [ ])
-      ", accelerated with ${strings.concatStringsSep ", " suffices}";
-
-  executableSuffix = effectiveStdenv.hostPlatform.extensions.executable;
-
-  # TODO: package the Python in this repository in a Nix-like way.
-  # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
-  # is PEP 517-compatible, and ensure the correct .dist-info is generated.
-  # https://peps.python.org/pep-0517/
-  #
-  # TODO: Package up each Python script or service appropriately, by making
-  # them into "entrypoints"
-  llama-python = python3.withPackages (
-    ps: [
-      ps.numpy
-      ps.sentencepiece
-    ]
-  );
-
-  # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
-  llama-python-extra = python3.withPackages (
-    ps: [
-      ps.numpy
-      ps.sentencepiece
-      ps.tiktoken
-      ps.torchWithoutCuda
-      ps.transformers
-
-      # server bench
-      ps.matplotlib
-
-      # server tests
-      ps.openai
-      ps.behave
-      ps.prometheus-client
-
-      # for examples/pydantic-models-to-grammar-examples.py
-      ps.docstring-parser
-      ps.pydantic
-
-      # for scripts/compare-llama-bench.py
-      ps.gitpython
-      ps.tabulate
-    ]
-  );
-
-  xcrunHost = runCommand "xcrunHost" {} ''
+  descriptionSuffix = strings.optionalString (
+    suffices != [ ]
+  ) ", accelerated with ${strings.concatStringsSep ", " suffices}";
+
+  xcrunHost = runCommand "xcrunHost" { } ''
     mkdir -p $out/bin
     ln -s /usr/bin/xcrun $out/bin
   '';
@@ -144,181 +102,145 @@ let
   ];
 in

-effectiveStdenv.mkDerivation (
-  finalAttrs: {
+effectiveStdenv.mkDerivation (finalAttrs: {
   pname = "llama-cpp${pnameSuffix}";
   version = llamaVersion;

   # Note: none of the files discarded here are visible in the sandbox or
   # affect the output hash. This also means they can be modified without
   # triggering a rebuild.
   src = lib.cleanSourceWith {
     filter =
       name: type:
       let
         noneOf = builtins.all (x: !x);
         baseName = baseNameOf name;
       in
       noneOf [
         (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
         (lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths
         (lib.hasPrefix "." baseName) # Skip hidden files and directories
         (baseName == "flake.lock")
       ];
     src = lib.cleanSource ../../.;
   };

   postPatch = ''
     substituteInPlace ./ggml/src/ggml-metal.m \
       --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
     substituteInPlace ./ggml/src/ggml-metal.m \
       --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
   '';

   # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
   # `default.metallib` may be compiled with Metal compiler from XCode
   # and we need to escape sandbox on MacOS to access Metal compiler.
   # `xcrun` is used find the path of the Metal compiler, which is varible
   # and not on $PATH
   # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
   __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;

   nativeBuildInputs =
     [
       cmake
       ninja
       pkg-config
       git
     ]
     ++ optionals useCuda [
       cudaPackages.cuda_nvcc

-      # TODO: Replace with autoAddDriverRunpath
-      # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
-      cudaPackages.autoAddOpenGLRunpathHook
+      autoAddDriverRunpath
     ]
-    ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
-      glibc.static
-    ] ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [
-      xcrunHost
-    ];
+    ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ glibc.static ]
+    ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ xcrunHost ];

   buildInputs =
     optionals effectiveStdenv.isDarwin darwinBuildInputs
     ++ optionals useCuda cudaBuildInputs
     ++ optionals useMpi [ mpi ]
     ++ optionals useRocm rocmBuildInputs
     ++ optionals useBlas [ blas ]
     ++ optionals useVulkan vulkanBuildInputs
     ++ optionals enableCurl [ curl ];

   cmakeFlags =
     [
       (cmakeBool "LLAMA_BUILD_SERVER" true)
       (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
       (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
       (cmakeBool "LLAMA_CURL" enableCurl)
       (cmakeBool "GGML_NATIVE" false)
       (cmakeBool "GGML_BLAS" useBlas)
       (cmakeBool "GGML_CUDA" useCuda)
       (cmakeBool "GGML_HIPBLAS" useRocm)
       (cmakeBool "GGML_METAL" useMetalKit)
       (cmakeBool "GGML_VULKAN" useVulkan)
       (cmakeBool "GGML_STATIC" enableStatic)
     ]
     ++ optionals useCuda [
       (
         with cudaPackages.flags;
         cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
           builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
         )
       )
     ]
     ++ optionals useRocm [
       (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
       (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
     ]
     ++ optionals useMetalKit [
       (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
       (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
     ];

   # Environment variables needed for ROCm
   env = optionals useRocm {
     ROCM_PATH = "${rocmPackages.clr}";
     HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
   };

   # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
   # if they haven't been added yet.
   postInstall = ''
     mkdir -p $out/include
     cp $src/include/llama.h $out/include/
   '';

-  # Define the shells here, but don't add in the inputsFrom to avoid recursion.
-  passthru = {
-    inherit
-      useBlas
-      useCuda
-      useMetalKit
-      useMpi
-      useRocm
-      useVulkan
-      ;
-
-    shell = mkShell {
-      name = "shell-${finalAttrs.finalPackage.name}";
-      description = "contains numpy and sentencepiece";
-      buildInputs = [ llama-python ];
-      inputsFrom = [ finalAttrs.finalPackage ];
-      shellHook = ''
-        addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
-      '';
-    };
-
-    shell-extra = mkShell {
-      name = "shell-extra-${finalAttrs.finalPackage.name}";
-      description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
-      buildInputs = [ llama-python-extra ];
-      inputsFrom = [ finalAttrs.finalPackage ];
-    };
-  };
-
   meta = {
     # Configurations we don't want even the CI to evaluate. Results in the
     # "unsupported platform" messages. This is mostly a no-op, because
     # cudaPackages would've refused to evaluate anyway.
     badPlatforms = optionals useCuda lib.platforms.darwin;

     # Configurations that are known to result in build failures. Can be
     # overridden by importing Nixpkgs with `allowBroken = true`.
     broken = (useMetalKit && !effectiveStdenv.isDarwin);

     description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
     homepage = "https://github.com/ggerganov/llama.cpp/";
     license = lib.licenses.mit;

     # Accommodates `nix run` and `lib.getExe`
     mainProgram = "llama-cli";

     # These people might respond, on the best effort basis, if you ping them
     # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
     # Consider adding yourself to this list if you want to ensure this flake
     # stays maintained and you're willing to invest your time. Do not add
     # other people without their consent. Consider removing people after
     # they've been unreachable for long periods of time.

     # Note that lib.maintainers is defined in Nixpkgs, but you may just add
     # an attrset following the same format as in
     # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
     maintainers = with lib.maintainers; [
       philiptaron
       SomeoneSerge
     ];

     # Extend `badPlatforms` instead
     platforms = lib.platforms.all;
   };
-  }
-)
+})
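The derivation itself keeps its behavior; the shells and Python environments merely moved out to devshells.nix and python-scripts.nix. Building and running it still follows the usual flake pattern sketched here; the attribute name is an assumption about how the flake exposes this package.

  # build the llama-cpp package defined by package.nix
  nix build .#llama-cpp

  # meta.mainProgram = "llama-cli" lets `nix run` pick the right binary
  nix run .#llama-cpp -- --help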
.devops/nix/python-scripts.nix (new file, 66 lines)
@@ -0,0 +1,66 @@
+{
+  lib,
+  stdenv,
+  buildPythonPackage,
+  poetry-core,
+  mkShell,
+  python3Packages,
+  gguf-py,
+}@inputs:
+
+let
+  llama-python-deps = with python3Packages; [
+    numpy
+    sentencepiece
+    transformers
+    protobuf
+    torchWithoutCuda
+    gguf-py
+    tqdm
+
+    # for scripts/compare-llama-bench.py
+    gitpython
+    tabulate
+
+    # for examples/pydantic-models-to-grammar-examples.py
+    docstring-parser
+    pydantic
+
+  ];
+
+  llama-python-test-deps = with python3Packages; [
+    # Server bench
+    matplotlib
+
+    # server tests
+    openai
+    behave
+    prometheus-client
+  ];
+in
+
+buildPythonPackage ({
+  pname = "llama-scripts";
+  version = "0.0.0";
+  pyproject = true;
+
+  # NOTE: The files filtered out here are not visible in the build sandbox, neither
+  # do they affect the output hash. They can be modified without triggering a rebuild.
+  src = lib.cleanSourceWith {
+    filter =
+      name: type:
+      let
+        any = builtins.any (x: x);
+        baseName = builtins.baseNameOf name;
+      in
+      any [
+        (lib.hasSuffix ".py" name)
+        (baseName == "README.md")
+        (baseName == "pyproject.toml")
+      ];
+    src = lib.cleanSource ../../.;
+  };
+  nativeBuildInputs = [ poetry-core ];
+  nativeCheckInputs = llama-python-test-deps;
+  dependencies = llama-python-deps;
+})
@@ -1,19 +1,41 @@
 {
   lib,
   newScope,
+  python3,
   llamaVersion ? "0.0.0",
 }:

+let
+  pythonPackages = python3.pkgs;
+  buildPythonPackage = pythonPackages.buildPythonPackage;
+  numpy = pythonPackages.numpy;
+  tqdm = pythonPackages.tqdm;
+  sentencepiece = pythonPackages.sentencepiece;
+  pyyaml = pythonPackages.pyyaml;
+  poetry-core = pythonPackages.poetry-core;
+  pytestCheckHook = pythonPackages.pytestCheckHook;
+in
+
 # We're using `makeScope` instead of just writing out an attrset
 # because it allows users to apply overlays later using `overrideScope'`.
 # Cf. https://noogle.dev/f/lib/makeScope

-lib.makeScope newScope (
-  self: {
-    inherit llamaVersion;
-    llama-cpp = self.callPackage ./package.nix { };
-    docker = self.callPackage ./docker.nix { };
-    docker-min = self.callPackage ./docker.nix { interactive = false; };
-    sif = self.callPackage ./sif.nix { };
-  }
-)
+lib.makeScope newScope (self: {
+  inherit llamaVersion;
+  gguf-py = self.callPackage ./package-gguf-py.nix {
+    inherit
+      buildPythonPackage
+      numpy
+      tqdm
+      sentencepiece
+      poetry-core
+      pyyaml
+      pytestCheckHook
+      ;
+  };
+  python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
+  llama-cpp = self.callPackage ./package.nix { };
+  docker = self.callPackage ./docker.nix { };
+  docker-min = self.callPackage ./docker.nix { interactive = false; };
+  sif = self.callPackage ./sif.nix { };
+})
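The scope now wires the two new Python derivations into the package set. Assuming the flake surfaces these scope attributes as packages (an assumption, not shown in this diff), they can be built, and their import/pytest checks run, directly:

  # builds the gguf Python package and runs its pythonImportsCheck / pytestCheckHook
  nix build .#gguf-py

  # packages the repository's *.py scripts together with their runtime dependencies
  nix build .#python-scripts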
.ecrc (2 changed lines)
@@ -1,5 +1,5 @@
 {
-    "Exclude": ["^\\.gitmodules$"],
+    "Exclude": ["^\\.gitmodules$", "stb_image\\.h"],
     "Disable": {
         "IndentSize": true
     }
.github/workflows/build.yml (21 changed lines)
@@ -23,6 +23,9 @@ env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
   GGML_NLOOP: 3
   GGML_N_THREADS: 1
+  LLAMA_LOG_COLORS: 1
+  LLAMA_LOG_PREFIX: 1
+  LLAMA_LOG_TIMESTAMPS: 1

 jobs:
   macOS-latest-cmake-arm64:
@@ -375,7 +378,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Dependencies
         id: depends
@@ -401,7 +404,7 @@ jobs:
     continue-on-error: true

     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4

       - name: add oneAPI to apt
         shell: bash
@@ -442,7 +445,7 @@ jobs:
     continue-on-error: true

     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4

       - name: add oneAPI to apt
         shell: bash
@@ -546,7 +549,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v4

       - name: Dependencies
         id: depends
@@ -576,7 +579,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v4

       - name: Dependencies
         id: depends
@@ -610,7 +613,7 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v1
+        uses: actions/checkout@v4

       - name: Dependencies
         id: depends
@@ -857,7 +860,7 @@ jobs:
       run: |
         mkdir build
         cd build
-        cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
+        cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
         cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
         cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
@@ -969,14 +972,14 @@ jobs:
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Install
         id: depends
         run: |
           $ErrorActionPreference = "Stop"
           write-host "Downloading AMD HIP SDK Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
           write-host "Installing AMD HIP SDK"
           Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
           write-host "Completed AMD HIP SDK installation"
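The LLAMA_LOG_* variables exported to CI can be mirrored locally when debugging a failing job; this is just a sketch of the same environment, not an exhaustive list of what the binaries read.

  # mirror the CI logging environment before running a test binary
  export LLAMA_LOG_COLORS=1
  export LLAMA_LOG_PREFIX=1
  export LLAMA_LOG_TIMESTAMPS=1
  ./tests/test-log   # test target added in the Makefile changes below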
.github/workflows/docker.yml (21 changed lines)
@@ -37,9 +37,9 @@ jobs:
           - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
           - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
           - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          # Note: the full-rocm image is failing due to a "no space left on device" error. It is disabled for now to allow the workflow to complete.
+          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
+          #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
           - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
@@ -96,21 +96,12 @@ jobs:
         env:
           GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'

-      - name: Build and push Docker image (versioned)
+      - name: Build and push Docker image (tagged + versioned)
         if: github.event_name == 'push'
-        uses: docker/build-push-action@v4
+        uses: docker/build-push-action@v6
         with:
           context: .
           push: true
           platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
-          file: ${{ matrix.config.dockerfile }}
-
-      - name: Build and push Docker image (tagged)
-        uses: docker/build-push-action@v4
-        with:
-          context: .
-          push: ${{ github.event_name == 'push' }}
-          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
+          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
           file: ${{ matrix.config.dockerfile }}
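With the versioned and tagged pushes folded into a single docker/build-push-action@v6 step, every push now publishes the moving tag, the commit-SHA tag, and the release-name tag together. Pulling any of them follows the usual ghcr.io pattern; the owner below is an example and the SHA is a placeholder.

  # moving tag for the latest pushed build of this flavor
  docker pull ghcr.io/ggerganov/llama.cpp:server-cuda

  # immutable tag pinned to a specific commit SHA
  docker pull ghcr.io/ggerganov/llama.cpp:server-cuda-<commit-sha>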
.github/workflows/server.yml (7 changed lines)
@@ -20,6 +20,12 @@ on:
       types: [opened, synchronize, reopened]
       paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']

+env:
+  LLAMA_LOG_COLORS: 1
+  LLAMA_LOG_PREFIX: 1
+  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_LOG_VERBOSITY: 10
+
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
@@ -173,6 +179,7 @@ jobs:
         if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
         run: |
           cd examples/server/tests
+          $env:PYTHONIOENCODING = ":replace"
          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp

       - name: Slow tests
.gitignore (1 changed line)
@@ -61,6 +61,7 @@ llama-batched-swift
 /rpc-server
 out/
 tmp/
+autogen-*.md

 # Deprecated
@@ -82,11 +82,11 @@ set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})

 # change the default for these ggml options
 if (NOT DEFINED GGML_LLAMAFILE)
-    set(GGML_LLAMAFILE ON)
+    set(GGML_LLAMAFILE_DEFAULT ON)
 endif()

-if (NOT DEFINED GGML_CUDA_USE_GRAPHS)
-    set(GGML_CUDA_USE_GRAPHS ON)
+if (NOT DEFINED GGML_CUDA_GRAPHS)
+    set(GGML_CUDA_GRAPHS_DEFAULT ON)
 endif()

 # transition helpers
@@ -139,10 +139,16 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location o
 # determining _precisely_ which defines are necessary for the llama-config
 # package.
 #
+set(GGML_TRANSIENT_DEFINES)
 get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
 get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
+if (GGML_DIR_DEFINES)
+    list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES})
+endif()
 get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
-set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
+if (GGML_TARGET_DEFINES)
+    list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
+endif()
 get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)

 set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
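Because the cached fallbacks are now *_DEFAULT variables, the user-facing option names stay as GGML_LLAMAFILE and GGML_CUDA_GRAPHS; a configure line that overrides them explicitly might look like this (values are illustrative).

  # explicit overrides win over the GGML_*_DEFAULT fallbacks set above
  cmake -B build -DGGML_CUDA=ON -DGGML_CUDA_GRAPHS=ON -DGGML_LLAMAFILE=OFF
  cmake --build build --config Release -j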
@@ -28,11 +28,12 @@
     { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
     { "name": "reldbg",  "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
     { "name": "static",  "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
+    { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },

     {
         "name": "arm64-windows-msvc", "hidden": true,
         "architecture": { "value": "arm64", "strategy": "external" },
-        "toolset":      { "value": "host=x86_64", "strategy": "external" },
+        "toolset":      { "value": "host=x64", "strategy": "external" },
         "cacheVariables": {
             "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
         }
@@ -40,8 +41,8 @@

     {
         "name": "arm64-windows-llvm", "hidden": true,
         "architecture": { "value": "arm64", "strategy": "external" },
-        "toolset":      { "value": "host=x86_64", "strategy": "external" },
+        "toolset":      { "value": "host=x64", "strategy": "external" },
         "cacheVariables": {
             "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
         }
@@ -60,6 +61,8 @@
     { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },

     { "name": "x64-windows-sycl-debug",   "inherits": [ "sycl-base", "debug" ] },
-    { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
+    { "name": "x64-windows-sycl-debug-f16",   "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
+    { "name": "x64-windows-sycl-release",     "inherits": [ "sycl-base", "release" ] },
+    { "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] }
   ]
 }
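The new *-f16 presets simply compose the existing sycl-base/debug/release presets with the sycl_f16 cache variable, so they are selected like any other preset. A sketch, assuming the Intel oneAPI environment is already sourced and with the binary directory left as a placeholder:

  # configure the FP16-enabled SYCL release preset on Windows
  cmake --preset x64-windows-sycl-release-f16

  # then build the configured tree (the binary directory comes from the preset's base settings)
  cmake --build <binaryDir-from-preset> --config Release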
Makefile (74 changed lines)
@@ -39,10 +39,12 @@ BUILD_TARGETS = \
 	llama-tokenize \
 	llama-vdot \
 	llama-cvector-generator \
+	llama-gen-docs \
 	tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = \
+	tests/test-arg-parser \
 	tests/test-autorelease \
 	tests/test-backend-ops \
 	tests/test-chat-template \
@@ -52,6 +54,7 @@ TEST_TARGETS = \
 	tests/test-grammar-parser \
 	tests/test-json-schema-to-grammar \
 	tests/test-llama-grammar \
+	tests/test-log \
 	tests/test-model-load-cancel \
 	tests/test-opt \
 	tests/test-quantize-fns \
@@ -146,6 +149,14 @@ GGML_NO_METAL := 1
 DEPRECATE_WARNING := 1
 endif

+ifdef LLAMA_DISABLE_LOGS
+REMOVE_WARNING := 1
+endif
+
+ifdef LLAMA_SERVER_VERBOSE
+REMOVE_WARNING := 1
+endif
+
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif
@@ -349,19 +360,11 @@ ifdef LLAMA_SANITIZE_UNDEFINED
 MK_LDFLAGS += -fsanitize=undefined -g
 endif

-ifdef LLAMA_SERVER_VERBOSE
-MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
-endif
-
 ifdef LLAMA_SERVER_SSL
 MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
 MK_LDFLAGS += -lssl -lcrypto
 endif

-ifdef LLAMA_DISABLE_LOGS
-MK_CPPFLAGS += -DLOG_DISABLE_LOGS
-endif # LLAMA_DISABLE_LOGS
-
 # warnings
 WARN_FLAGS = \
 	-Wall \
@@ -432,7 +435,7 @@ endif
 # TODO: probably these flags need to be tweaked on some architectures
 # feel free to update the Makefile for your architecture and send a pull request or issue

-ifndef RISCV
+ifndef RISCV_CROSS_COMPILE

 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	# Use all CPU extensions that are available:
@@ -512,7 +515,12 @@ ifneq ($(filter loongarch64%,$(UNAME_M)),)
 	MK_CXXFLAGS += -mlasx
 endif

-else
+ifneq ($(filter riscv64%,$(UNAME_M)),)
+	MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
+	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
+endif
+
+else # RISC-V CROSS COMPILATION
 	MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
 	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif
@@ -611,7 +619,7 @@ ifdef GGML_CUDA
 	CUDA_PATH ?= /usr/local/cuda
 endif

-	MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
+	MK_CPPFLAGS  += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
 	MK_NVCCFLAGS += -use_fast_math
 endif # GGML_MUSA
@@ -923,11 +931,12 @@ OBJ_LLAMA = \

 OBJ_COMMON = \
 	common/common.o \
+	common/arg.o \
+	common/log.o \
 	common/console.o \
 	common/ngram-cache.o \
 	common/sampling.o \
 	common/train.o \
-	common/grammar-parser.o \
 	common/build-info.o \
 	common/json-schema-to-grammar.o
@@ -1020,6 +1029,14 @@ $(info - LLAMA_NO_CCACHE)
 $(info )
 endif

+ifdef REMOVE_WARNING
+$(info !!! REMOVAL WARNING !!!)
+$(info The following LLAMA_ options have been removed and are no longer supported)
+$(info   - LLAMA_DISABLE_LOGS   (https://github.com/ggerganov/llama.cpp/pull/9418))
+$(info   - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418))
+$(info )
+endif
+
 #
 # Build libraries
 #
@@ -1156,6 +1173,16 @@ common/common.o: \
 	include/llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

+common/arg.o: \
+	common/arg.cpp \
+	common/arg.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+common/log.o: \
+	common/log.cpp \
+	common/log.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 common/sampling.o: \
 	common/sampling.cpp \
 	common/sampling.h \
@@ -1167,11 +1194,6 @@ common/console.o: \
 	common/console.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-common/grammar-parser.o: \
-	common/grammar-parser.cpp \
-	common/grammar-parser.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
 common/json-schema-to-grammar.o: \
 	common/json-schema-to-grammar.cpp \
 	common/json-schema-to-grammar.h
@@ -1339,7 +1361,7 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

 llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
-	$(OBJ_GGML) $(OBJ_LLAMA)
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1433,6 +1455,7 @@ llama-server: \
 	examples/server/system-prompts.js.hpp \
 	examples/server/prompt-formats.js.hpp \
 	examples/server/json-schema-to-grammar.mjs.hpp \
+	examples/server/loading.html.hpp \
 	common/json.hpp \
 	common/stb_image.h \
 	$(OBJ_ALL)
@@ -1448,6 +1471,11 @@ examples/server/%.hpp: examples/server/public/% Makefile
 		echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
 	) > $@

+llama-gen-docs: examples/gen-docs/gen-docs.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 libllava.a: examples/llava/llava.cpp \
 	examples/llava/llava.h \
 	examples/llava/clip.cpp \
@@ -1505,11 +1533,21 @@ run-benchmark-matmult: llama-benchmark-matmult

 .PHONY: run-benchmark-matmult swift

+tests/test-arg-parser: tests/test-arg-parser.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 tests/test-llama-grammar: tests/test-llama-grammar.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+tests/test-log: tests/test-log.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 tests/test-grammar-parser: tests/test-grammar-parser.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)

README.md (28 changes)
@@ -10,32 +10,14 @@

 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++

-> [!IMPORTANT]
-[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
-
 ## Recent API changes

-- [2024 Jun 26] The source code and CMake build scripts have been restructured https://github.com/ggerganov/llama.cpp/pull/8006
-- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
-- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
-- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
-- [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
-- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
-- [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
-- [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849
+- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289)
+- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291)

 ## Hot topics

-- **`convert.py` has been deprecated and moved to `examples/convert_legacy_llama.py`, please use `convert_hf_to_gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
-- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
-- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
-- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
-- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
-- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
-- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017
-- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
-- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
-- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
+- Huggingface GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)

 ----

@@ -95,6 +77,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
 - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
 - [x] [OLMo](https://allenai.org/olmo)
+- [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924)
 - [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
 - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
 - [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520)
@@ -107,6 +90,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
 - [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
 - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
+- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)

 (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))

@@ -181,6 +165,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
 - [AIKit](https://github.com/sozercan/aikit) (MIT)
 - [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
+- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)

 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

@@ -189,6 +174,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
 - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
 - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
+- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example)

 **Infrastructure:**


ci/run.sh (32 changes)
@@ -13,6 +13,9 @@
 # # with SYCL support
 # GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
+# # with VULKAN support
+# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#

 if [ -z "$2" ]; then
     echo "usage: $0 <output-dir> <mnt-dir>"
@@ -40,7 +43,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
 fi

 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=1"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
 fi

 if [ ! -z ${GG_BUILD_SYCL} ]; then
@@ -52,6 +55,10 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then

     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
 fi
+
+if [ ! -z ${GG_BUILD_VULKAN} ]; then
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
+fi
 ## helpers

 # download a file if it does not exist or if it is outdated
@@ -107,7 +114,7 @@ function gg_run_ctest_debug {
     gg_check_build_requirements

     (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

     (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log

@@ -138,7 +145,7 @@ function gg_run_ctest_release {
     gg_check_build_requirements

     (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

     if [ -z ${GG_BUILD_LOW_PERF} ]; then
         (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
@@ -266,7 +273,6 @@ function gg_sum_ctest_with_model_release {
 }

 # open_llama_7b_v2
-# requires: GG_BUILD_CUDA

 function gg_run_open_llama_7b_v2 {
     cd ${SRC}
@@ -290,8 +296,8 @@ function gg_run_open_llama_7b_v2 {

     set -e

-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

     python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

@@ -425,7 +431,7 @@ function gg_run_pythia_1_4b {
     set -e

     (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

     python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

@@ -535,7 +541,6 @@ function gg_sum_pythia_1_4b {
 }

 # pythia_2_8b
-# requires: GG_BUILD_CUDA

 function gg_run_pythia_2_8b {
     cd ${SRC}
@@ -556,8 +561,8 @@ function gg_run_pythia_2_8b {

     set -e

-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

     python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

@@ -692,7 +697,7 @@ function gg_run_embd_bge_small {
     set -e

     (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

     python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

@@ -732,6 +737,9 @@ function gg_sum_embd_bge_small {

 ## main

+export LLAMA_LOG_PREFIX=1
+export LLAMA_LOG_TIMESTAMPS=1
+
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
     # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
     rm -rf ${SRC}/models-mnt
@@ -761,7 +769,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
 fi

 if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
-    if [ -z ${GG_BUILD_CUDA} ]; then
+    if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then
         test $ret -eq 0 && gg_run pythia_1_4b
     else
         test $ret -eq 0 && gg_run pythia_2_8b

common/CMakeLists.txt
@@ -51,21 +51,23 @@ endif()
 set(TARGET common)

 add_library(${TARGET} STATIC
+    arg.cpp
+    arg.h
     base64.hpp
-    common.h
     common.cpp
-    sampling.h
-    sampling.cpp
-    console.h
+    common.h
     console.cpp
-    grammar-parser.h
-    grammar-parser.cpp
-    json.hpp
+    console.h
     json-schema-to-grammar.cpp
-    train.h
-    train.cpp
-    ngram-cache.h
+    json.hpp
+    log.cpp
+    log.h
     ngram-cache.cpp
+    ngram-cache.h
+    sampling.cpp
+    sampling.h
+    train.cpp
+    train.h
     )

 if (BUILD_SHARED_LIBS)

common/arg.cpp (new file, 1994 lines; diff not shown because it is too large)

common/arg.h (new file, 77 lines)
@@ -0,0 +1,77 @@
+#pragma once
+
+#include "common.h"
+
+#include <set>
+#include <string>
+#include <vector>
+
+//
+// CLI argument parsing
+//
+
+struct llama_arg {
+    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
+    std::vector<const char *> args;
+    const char * value_hint   = nullptr; // help text or example for arg value
+    const char * value_hint_2 = nullptr; // for second arg value
+    const char * env          = nullptr;
+    std::string help;
+    bool is_sparam = false; // is current arg a sampling param?
+    void (*handler_void)   (gpt_params & params) = nullptr;
+    void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
+    void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
+    void (*handler_int)    (gpt_params & params, int) = nullptr;
+
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const std::string & help,
+        void (*handler)(gpt_params & params, const std::string &)
+    ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
+
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const std::string & help,
+        void (*handler)(gpt_params & params, int)
+    ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
+
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const std::string & help,
+        void (*handler)(gpt_params & params)
+    ) : args(args), help(help), handler_void(handler) {}
+
+    // support 2 values for arg
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const char * value_hint_2,
+        const std::string & help,
+        void (*handler)(gpt_params & params, const std::string &, const std::string &)
+    ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
+
+    llama_arg & set_examples(std::initializer_list<enum llama_example> examples);
+    llama_arg & set_env(const char * env);
+    llama_arg & set_sparam();
+    bool in_example(enum llama_example ex);
+    bool get_value_from_env(std::string & output);
+    bool has_value_from_env();
+    std::string to_string();
+};
+
+struct gpt_params_context {
+    enum llama_example ex = LLAMA_EXAMPLE_COMMON;
+    gpt_params & params;
+    std::vector<llama_arg> options;
+    void(*print_usage)(int, char **) = nullptr;
+    gpt_params_context(gpt_params & params) : params(params) {}
+};
+
+// parse input arguments from CLI
+// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
+
+// function to be used by test-arg-parser
+gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
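
For orientation, here is a minimal sketch of how an example binary might consume the new header above. It relies only on the `gpt_params_parse` declaration shown in `common/arg.h`; the choice of `LLAMA_EXAMPLE_COMMON` and the `print_usage` callback are illustrative, and building it assumes the llama.cpp `common` library is on the include and link path.

// sketch, not part of this commit
#include "arg.h"
#include "common.h"

#include <cstdio>

static void print_usage(int /*argc*/, char ** argv) {
    printf("usage: %s [options]\n", argv[0]);
}

int main(int argc, char ** argv) {
    gpt_params params;

    // fills `params` from argv (and from registered environment variables);
    // on a bad value it prints the usage of the offending argument and returns false
    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
        return 1;
    }

    printf("model: %s\n", params.model.c_str());
    return 0;
}
// end of sketch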

common/common.cpp (1982 changes; diff not shown because it is too large)

common/common.h (190 changes)
@@ -4,18 +4,9 @@

 #include "llama.h"

-#include "sampling.h"
-
-#define LOG_NO_FILE_LINE_FUNCTION
-#include "log.h"
-
-#include <cmath>
 #include <string>
 #include <vector>
-#include <random>
-#include <thread>
-#include <unordered_map>
-#include <tuple>
+#include <sstream>

 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -54,26 +45,103 @@ struct llama_control_vector_load_info;
 // CPU utils
 //

+struct cpu_params {
+    int n_threads = -1;
+    bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
+    bool mask_valid = false;                    // Default: any CPU
+    enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+    bool strict_cpu = false;                    // Use strict CPU placement
+    uint32_t poll = 50;                         // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+};
+
 int32_t cpu_get_num_physical_cores();
 int32_t cpu_get_num_math();

 //
-// CLI argument parsing
+// Common params
 //

+enum llama_example {
+    LLAMA_EXAMPLE_COMMON,
+    LLAMA_EXAMPLE_SPECULATIVE,
+    LLAMA_EXAMPLE_MAIN,
+    LLAMA_EXAMPLE_INFILL,
+    LLAMA_EXAMPLE_EMBEDDING,
+    LLAMA_EXAMPLE_PERPLEXITY,
+    LLAMA_EXAMPLE_RETRIEVAL,
+    LLAMA_EXAMPLE_PASSKEY,
+    LLAMA_EXAMPLE_IMATRIX,
+    LLAMA_EXAMPLE_BENCH,
+    LLAMA_EXAMPLE_SERVER,
+    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
+    LLAMA_EXAMPLE_EXPORT_LORA,
+    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_LOOKUP,
+    LLAMA_EXAMPLE_PARALLEL,
+
+    LLAMA_EXAMPLE_COUNT,
+};
+
+enum gpt_sampler_type {
+    GPT_SAMPLER_TYPE_NONE        = 0,
+    GPT_SAMPLER_TYPE_TOP_K       = 1,
+    GPT_SAMPLER_TYPE_TOP_P       = 2,
+    GPT_SAMPLER_TYPE_MIN_P       = 3,
+    GPT_SAMPLER_TYPE_TFS_Z       = 4,
+    GPT_SAMPLER_TYPE_TYPICAL_P   = 5,
+    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
+};
+
 // dimensionality reduction methods, used by cvector-generator
 enum dimre_method {
     DIMRE_METHOD_PCA,
     DIMRE_METHOD_MEAN,
 };

-struct gpt_params {
-    uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
+// sampler parameters
+struct gpt_sampler_params {
+    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler

-    int32_t n_threads             = cpu_get_num_math();
-    int32_t n_threads_draft       = -1;
-    int32_t n_threads_batch       = -1; // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft = -1;
+    int32_t n_prev            = 64;    // number of previous tokens to remember
+    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t min_keep          = 0;     // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t top_k             = 40;    // <= 0 to use vocab size
+    float   top_p             = 0.95f; // 1.0 = disabled
+    float   min_p             = 0.05f; // 0.0 = disabled
+    float   tfs_z             = 1.00f; // 1.0 = disabled
+    float   typ_p             = 1.00f; // typical_p, 1.0 = disabled
+    float   temp              = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float   dynatemp_range    = 0.00f; // 0.0 = disabled
+    float   dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float   penalty_repeat    = 1.00f; // 1.0 = disabled
+    float   penalty_freq      = 0.00f; // 0.0 = disabled
+    float   penalty_present   = 0.00f; // 0.0 = disabled
+    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float   mirostat_tau      = 5.00f; // target entropy
+    float   mirostat_eta      = 0.10f; // learning rate
+    bool    penalize_nl       = false; // consider newlines as a repeatable token
+    bool    ignore_eos        = false;
+    bool    no_perf           = false; // disable performance metrics
+
+    std::vector<enum gpt_sampler_type> samplers = {
+        GPT_SAMPLER_TYPE_TOP_K,
+        GPT_SAMPLER_TYPE_TFS_Z,
+        GPT_SAMPLER_TYPE_TYPICAL_P,
+        GPT_SAMPLER_TYPE_TOP_P,
+        GPT_SAMPLER_TYPE_MIN_P,
+        GPT_SAMPLER_TYPE_TEMPERATURE
+    };
+
+    std::string grammar; // optional BNF-like grammar to constrain sampling
+
+    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
+
+    // print the parameters into a string
+    std::string print() const;
+};
+
+struct gpt_params {
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx     = 0;  // context size
     int32_t n_batch   = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
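
As a quick illustration of the split introduced in the hunk above (sampling settings now live in `gpt_sampler_params`, threading in `cpu_params`), here is a hedged sketch of how calling code might tweak the defaults. The field and enum names are taken from the hunk itself; everything else is illustrative and assumes building inside the llama.cpp tree.

// sketch, not part of this commit
#include "common.h"

int main() {
    gpt_sampler_params sparams;    // defaults as listed above
    sparams.temp  = 0.7f;          // slightly less random than the 0.80f default
    sparams.top_k = 20;
    sparams.samplers = { GPT_SAMPLER_TYPE_TOP_K, GPT_SAMPLER_TYPE_TEMPERATURE };

    cpu_params cparams;            // n_threads = -1 means "auto"
    cparams.n_threads  = 8;
    cparams.strict_cpu = false;    // no strict placement, no affinity mask

    return 0;
}
// end of sketch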
@@ -100,6 +168,11 @@ struct gpt_params {
     int32_t yarn_orig_ctx = 0;     // YaRN original context length
     float defrag_thold    = -1.0f; // KV cache defragmentation threshold

+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+    struct cpu_params draft_cpuparams;
+    struct cpu_params draft_cpuparams_batch;
+
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data                 = nullptr;

@@ -110,26 +183,25 @@ struct gpt_params {
     enum llama_pooling_type pooling_type     = LLAMA_POOLING_TYPE_UNSPECIFIED;   // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

-    // // sampling parameters
-    struct llama_sampling_params sparams;
+    struct gpt_sampler_params sparams;

-    std::string model                = ""; // model path
-    std::string model_draft          = ""; // draft model for speculative decoding
-    std::string model_alias          = "unknown"; // model alias
-    std::string model_url            = ""; // model url to download
-    std::string hf_token             = ""; // HF token
-    std::string hf_repo              = ""; // HF repo
-    std::string hf_file              = ""; // HF file
-    std::string prompt               = "";
-    std::string prompt_file          = ""; // store the external prompt file name
-    std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state
-    std::string input_prefix         = ""; // string to prefix user inputs with
-    std::string input_suffix         = ""; // string to suffix user inputs with
-    std::string logdir               = ""; // directory in which to save YAML log files
-    std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding
-    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
-    std::string logits_file          = ""; // file for saving *all* logits
-    std::string rpc_servers          = ""; // comma separated list of RPC servers
+    std::string model                = ""; // model path // NOLINT
+    std::string model_draft          = ""; // draft model for speculative decoding // NOLINT
+    std::string model_alias          = "unknown"; // model alias // NOLINT
+    std::string model_url            = ""; // model url to download // NOLINT
+    std::string hf_token             = ""; // HF token // NOLINT
+    std::string hf_repo              = ""; // HF repo // NOLINT
+    std::string hf_file              = ""; // HF file // NOLINT
+    std::string prompt               = ""; // NOLINT
+    std::string prompt_file          = ""; // store the external prompt file name // NOLINT
+    std::string path_prompt_cache    = ""; // path to file for saving/loading prompt eval state // NOLINT
+    std::string input_prefix         = ""; // string to prefix user inputs with // NOLINT
+    std::string input_suffix         = ""; // string to suffix user inputs with // NOLINT
+    std::string logdir               = ""; // directory in which to save YAML log files // NOLINT
+    std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding // NOLINT
+    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
+    std::string logits_file          = ""; // file for saving *all* logits // NOLINT
+    std::string rpc_servers          = ""; // comma separated list of RPC servers // NOLINT

     std::vector<std::string> in_files;   // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
@@ -173,15 +245,15 @@ struct gpt_params {
     bool simple_io        = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching    = true;  // insert new sequences for decoding on-the-fly
     bool flash_attn       = false; // flash attention
+    bool no_perf          = false; // disable performance metrics
+    bool ctx_shift        = true;  // context shift on inifinite text generation

     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
-    bool ignore_eos       = false; // ignore generated EOS tokens
     bool logits_all       = false; // return logits for all tokens in the batch
     bool use_mmap         = true;  // use mmap for faster loads
     bool use_mlock        = false; // use mlock to keep model in memory
     bool verbose_prompt   = false; // print prompt tokens before generation
     bool display_prompt   = true;  // print prompt before generation
-    bool infill           = false; // use infill mode
     bool dump_kv_cache    = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload    = false; // disable KV offloading
     bool warmup           = true;  // warmup run
@@ -191,7 +263,7 @@ struct gpt_params {
     std::string cache_type_v = "f16"; // KV cache data type for the V

     // multimodal models (see examples/llava)
-    std::string mmproj = ""; // path to multimodal projector
+    std::string mmproj = ""; // path to multimodal projector // NOLINT
     std::vector<std::string> image; // path to image file(s)

     // embedding
@@ -204,18 +276,18 @@ struct gpt_params {
     int32_t port           = 8080;         // server listens on this network port
     int32_t timeout_read   = 600;          // http read timeout in seconds
     int32_t timeout_write  = timeout_read; // http write timeout in seconds
-    int32_t n_threads_http = -1;           // number of threads to process HTTP requests
+    int     n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)

     std::string hostname      = "127.0.0.1";
-    std::string public_path   = "";
-    std::string chat_template = "";
-    std::string system_prompt = "";
+    std::string public_path   = ""; // NOLINT
+    std::string chat_template = ""; // NOLINT
+    std::string system_prompt = ""; // NOLINT
     bool enable_chat_template = true;

     std::vector<std::string> api_keys;

-    std::string ssl_file_key  = "";
-    std::string ssl_file_cert = "";
+    std::string ssl_file_key  = ""; // NOLINT
+    std::string ssl_file_cert = ""; // NOLINT

     bool endpoint_slots   = true;
     bool endpoint_metrics = false;
@@ -265,18 +337,22 @@ struct gpt_params {
     bool spm_infill = false; // suffix/prefix/middle pattern for infill

     std::string lora_outfile = "ggml-lora-merged-f16.gguf";

+    // batched-bench params
+    bool batched_bench_output_jsonl = false;
 };

-void gpt_params_parse_from_env(gpt_params & params);
-void gpt_params_handle_model_default(gpt_params & params);
-bool gpt_params_parse_ex   (int argc, char ** argv, gpt_params & params);
-bool gpt_params_parse      (int argc, char ** argv, gpt_params & params);
-bool gpt_params_find_arg   (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
-void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
+// call once at the start of a program if it uses libcommon
+// initializes the logging system and prints info about the build
+void gpt_init();

 std::string gpt_params_get_system_info(const gpt_params & params);

+bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
+bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
+bool set_process_priority(enum ggml_sched_priority prio);
+
 //
 // String utils
 //
@@ -305,6 +381,11 @@ static std::vector<T> string_split(const std::string & str, char delim) {
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);

+std::string string_from(bool value);
+std::string string_from(const std::vector<int> & values);
+std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
+std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
+
 //
 // Filesystem utils
 //
@@ -327,8 +408,9 @@ struct llama_init_result {

 struct llama_init_result llama_init_from_gpt_params(gpt_params & params);

 struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
+struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

 struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
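
Tying the declarations at the end of common.h together, the following is a hedged sketch of the setup sequence a tool built on `common` might follow after this change. The function and field names come from the hunks above; the "0xF" CPU mask is an arbitrary example value, and the sketch assumes it is built inside the llama.cpp tree.

// sketch, not part of this commit
#include "common.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        return 1;
    }

    gpt_params params;
    params.model = argv[1];

    // pin the math threads to the first four cores via the new helpers
    if (!parse_cpu_mask("0xF", params.cpuparams.cpumask)) {
        return 1;
    }
    params.cpuparams.mask_valid = true;
    postprocess_cpu_params(params.cpuparams);

    // threadpool parameters derived from the cpu_params above
    struct ggml_threadpool_params tpp = ggml_threadpool_params_from_cpu_params(params.cpuparams);
    (void) tpp;

    // load the model and create a context according to `params`
    llama_init_result llama_init = llama_init_from_gpt_params(params);
    (void) llama_init;

    return 0;
}
// end of sketch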

common/grammar-parser.cpp (file deleted)
@@ -1,539 +0,0 @@
-#include "grammar-parser.h"
-#include <cstdint>
-#include <cwchar>
-#include <string>
-#include <utility>
-#include <stdexcept>
-#include <exception>
-
-namespace grammar_parser {
-    // NOTE: assumes valid utf8 (but checks for overrun)
-    // copied from llama.cpp
-    static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
-        static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
-        uint8_t first_byte = static_cast<uint8_t>(*src);
-        uint8_t highbits = first_byte >> 4;
-        int len = lookup[highbits];
-        uint8_t mask = (1 << (8 - len)) - 1;
-        uint32_t value = first_byte & mask;
-        const char * end = src + len; // may overrun!
-        const char * pos = src + 1;
-        for ( ; pos < end && *pos; pos++) {
-            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
-        }
-        return std::make_pair(value, pos);
-    }
-
-    static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
-        uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
-        auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
-        return result.first->second;
-    }
-
-    static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
-        uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
-        state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
-        return next_id;
-    }
-
-    static void add_rule(
-            parse_state & state,
-            uint32_t rule_id,
-            const std::vector<llama_grammar_element> & rule) {
-        if (state.rules.size() <= rule_id) {
-            state.rules.resize(rule_id + 1);
-        }
-        state.rules[rule_id] = rule;
-    }
-
-    static bool is_digit_char(char c) {
-        return '0' <= c && c <= '9';
-    }
-
-    static bool is_word_char(char c) {
-        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
-    }
-
-    static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
-        const char * pos = src;
-        const char * end = src + size;
-        uint32_t value = 0;
-        for ( ; pos < end && *pos; pos++) {
-            value <<= 4;
-            char c = *pos;
-            if ('a' <= c && c <= 'f') {
-                value += c - 'a' + 10;
-            } else if ('A' <= c && c <= 'F') {
-                value += c - 'A' + 10;
-            } else if ('0' <= c && c <= '9') {
-                value += c - '0';
-            } else {
-                break;
-            }
-        }
-        if (pos != end) {
-            throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
-        }
-        return std::make_pair(value, pos);
-    }
-
-    static const char * parse_space(const char * src, bool newline_ok) {
-        const char * pos = src;
-        while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
-                (newline_ok && (*pos == '\r' || *pos == '\n'))) {
-            if (*pos == '#') {
-                while (*pos && *pos != '\r' && *pos != '\n') {
-                    pos++;
-                }
-            } else {
-                pos++;
-            }
-        }
-        return pos;
-    }
-
-    static const char * parse_name(const char * src) {
-        const char * pos = src;
-        while (is_word_char(*pos)) {
-            pos++;
-        }
-        if (pos == src) {
-            throw std::runtime_error(std::string("expecting name at ") + src);
-        }
-        return pos;
-    }
-
-    static const char * parse_int(const char * src) {
-        const char * pos = src;
-        while (is_digit_char(*pos)) {
-            pos++;
-        }
-        if (pos == src) {
-            throw std::runtime_error(std::string("expecting integer at ") + src);
-        }
-        return pos;
-    }
-
-    static std::pair<uint32_t, const char *> parse_char(const char * src) {
-        if (*src == '\\') {
-            switch (src[1]) {
-                case 'x': return parse_hex(src + 2, 2);
-                case 'u': return parse_hex(src + 2, 4);
-                case 'U': return parse_hex(src + 2, 8);
-                case 't': return std::make_pair('\t', src + 2);
-                case 'r': return std::make_pair('\r', src + 2);
-                case 'n': return std::make_pair('\n', src + 2);
-                case '\\':
-                case '"':
-                case '[':
-                case ']':
-                    return std::make_pair(src[1], src + 2);
-                default:
-                    throw std::runtime_error(std::string("unknown escape at ") + src);
-            }
-        } else if (*src) {
-            return decode_utf8(src);
-        }
-        throw std::runtime_error("unexpected end of input");
-    }
-
-    const char * parse_alternates(
-            parse_state & state,
-            const char * src,
-            const std::string & rule_name,
-            uint32_t rule_id,
-            bool is_nested);
-
-    static const char * parse_sequence(
-            parse_state & state,
-            const char * src,
-            const std::string & rule_name,
-            std::vector<llama_grammar_element> & out_elements,
-            bool is_nested) {
-        size_t last_sym_start = out_elements.size();
-        const char * pos = src;
-
-        auto handle_repetitions = [&](int min_times, int max_times) {
-
-            if (last_sym_start == out_elements.size()) {
-                throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
-            }
-
-            // apply transformation to previous symbol (last_sym_start to end) according to
-            // the following rewrite rules:
-            // S{m,n} --> S S S (m times) S'(n-m)
-            //            S'(x)   ::= S S'(x-1) |
-            //            (... n-m definitions of these S' rules ...)
-            //            S'(1)   ::= S |
-            // S{m,}  --> S S S (m times) S'
-            //            S'     ::= S S' |
-            // S*     --> S{0,}
-            //        --> S'     ::= S S' |
-            // S+     --> S{1,}
-            //        --> S S'
-            //            S'     ::= S S' |
-            // S?     --> S{0,1}
-            //        --> S'
-            //            S'     ::= S |
-
-            std::vector<llama_grammar_element> previous_elements(out_elements.begin() + last_sym_start, out_elements.end());
-            if (min_times == 0) {
-                out_elements.resize(last_sym_start);
-            } else {
-                // Repeat the previous elements (min_times - 1) times
-                for (int i = 1; i < min_times; i++) {
-                    out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end());
-                }
-            }
-
-            uint32_t last_rec_rule_id = 0;
-            auto n_opt = max_times < 0 ? 1 : max_times - min_times;
-
-            std::vector<llama_grammar_element> rec_rule(previous_elements);
-            for (int i = 0; i < n_opt; i++) {
-                rec_rule.resize(previous_elements.size());
-                uint32_t rec_rule_id = generate_symbol_id(state, rule_name);
-                if (i > 0 || max_times < 0) {
-                    rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
-                }
-                rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
-                rec_rule.push_back({LLAMA_GRETYPE_END, 0});
-                add_rule(state, rec_rule_id, rec_rule);
-                last_rec_rule_id = rec_rule_id;
-            }
-            if (n_opt > 0) {
-                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
-            }
-        };
-
-        while (*pos) {
-            if (*pos == '"') { // literal string
-                pos++;
-                last_sym_start = out_elements.size();
-                while (*pos != '"') {
-                    if (!*pos) {
-                        throw std::runtime_error("unexpected end of input");
-                    }
-                    auto char_pair = parse_char(pos);
-                    pos = char_pair.second;
-                    out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
-                }
-                pos = parse_space(pos + 1, is_nested);
-            } else if (*pos == '[') { // char range(s)
-                pos++;
-                enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
-                if (*pos == '^') {
-                    pos++;
-                    start_type = LLAMA_GRETYPE_CHAR_NOT;
-                }
-                last_sym_start = out_elements.size();
-                while (*pos != ']') {
-                    if (!*pos) {
-                        throw std::runtime_error("unexpected end of input");
-                    }
-                    auto char_pair = parse_char(pos);
-                    pos = char_pair.second;
-                    enum llama_gretype type = last_sym_start < out_elements.size()
-                        ? LLAMA_GRETYPE_CHAR_ALT
-                        : start_type;
-
-                    out_elements.push_back({type, char_pair.first});
-                    if (pos[0] == '-' && pos[1] != ']') {
-                        if (!pos[1]) {
-                            throw std::runtime_error("unexpected end of input");
-                        }
-                        auto endchar_pair = parse_char(pos + 1);
-                        pos = endchar_pair.second;
-                        out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
-                    }
-                }
-                pos = parse_space(pos + 1, is_nested);
-            } else if (is_word_char(*pos)) { // rule reference
-                const char * name_end = parse_name(pos);
-                uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos);
-                pos = parse_space(name_end, is_nested);
-                last_sym_start = out_elements.size();
-                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
-            } else if (*pos == '(') { // grouping
-                // parse nested alternates into synthesized rule
-                pos = parse_space(pos + 1, true);
-                uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
-                pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
-                last_sym_start = out_elements.size();
-                // output reference to synthesized rule
-                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
-                if (*pos != ')') {
-                    throw std::runtime_error(std::string("expecting ')' at ") + pos);
-                }
-                pos = parse_space(pos + 1, is_nested);
-            } else if (*pos == '.') { // any char
-                last_sym_start = out_elements.size();
-                out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
-                pos = parse_space(pos + 1, is_nested);
-            } else if (*pos == '*') {
-                pos = parse_space(pos + 1, is_nested);
-                handle_repetitions(0, -1);
-            } else if (*pos == '+') {
-                pos = parse_space(pos + 1, is_nested);
-                handle_repetitions(1, -1);
-            } else if (*pos == '?') {
-                pos = parse_space(pos + 1, is_nested);
-                handle_repetitions(0, 1);
-            } else if (*pos == '{') {
-                pos = parse_space(pos + 1, is_nested);
-
-                if (!is_digit_char(*pos)) {
-                    throw std::runtime_error(std::string("expecting an int at ") + pos);
-                }
-                const char * int_end = parse_int(pos);
-                int min_times = std::stoul(std::string(pos, int_end - pos));
-                pos = parse_space(int_end, is_nested);
-
-                int max_times = -1;
-
-                if (*pos == '}') {
-                    max_times = min_times;
-                    pos = parse_space(pos + 1, is_nested);
-                } else if (*pos == ',') {
-                    pos = parse_space(pos + 1, is_nested);
-
-                    if (is_digit_char(*pos)) {
-                        const char * int_end = parse_int(pos);
-                        max_times = std::stoul(std::string(pos, int_end - pos));
-                        pos = parse_space(int_end, is_nested);
-                    }
-
-                    if (*pos != '}') {
-                        throw std::runtime_error(std::string("expecting '}' at ") + pos);
-                    }
-                    pos = parse_space(pos + 1, is_nested);
-                } else {
-                    throw std::runtime_error(std::string("expecting ',' at ") + pos);
-                }
-                handle_repetitions(min_times, max_times);
-            } else {
-                break;
-            }
-        }
-        return pos;
-    }
-
-    const char * parse_alternates(
-            parse_state & state,
-            const char * src,
-            const std::string & rule_name,
-            uint32_t rule_id,
-            bool is_nested) {
-        std::vector<llama_grammar_element> rule;
-        const char * pos = parse_sequence(state, src, rule_name, rule, is_nested);
-        while (*pos == '|') {
-            rule.push_back({LLAMA_GRETYPE_ALT, 0});
-            pos = parse_space(pos + 1, true);
-            pos = parse_sequence(state, pos, rule_name, rule, is_nested);
-        }
-        rule.push_back({LLAMA_GRETYPE_END, 0});
-        add_rule(state, rule_id, rule);
-        return pos;
-    }
-
-    static const char * parse_rule(parse_state & state, const char * src) {
-        const char * name_end = parse_name(src);
-        const char * pos = parse_space(name_end, false);
-        size_t name_len = name_end - src;
-        uint32_t rule_id = get_symbol_id(state, src, name_len);
-        const std::string name(src, name_len);
-
-        if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
-            throw std::runtime_error(std::string("expecting ::= at ") + pos);
-        }
-        pos = parse_space(pos + 3, true);
-
-        pos = parse_alternates(state, pos, name, rule_id, false);
-
-        if (*pos == '\r') {
-            pos += pos[1] == '\n' ? 2 : 1;
-        } else if (*pos == '\n') {
-            pos++;
-        } else if (*pos) {
-            throw std::runtime_error(std::string("expecting newline or end at ") + pos);
-        }
-        return parse_space(pos, true);
-    }
-
-    parse_state parse(const char * src) {
-        try {
-            parse_state state;
-            const char * pos = parse_space(src, true);
-            while (*pos) {
-                pos = parse_rule(state, pos);
-            }
-            // Validate the state to ensure that all rules are defined
-            for (const auto & rule : state.rules) {
-                if (rule.empty()) {
-                    throw std::runtime_error("Undefined rule");
-                }
-                for (const auto & elem : rule) {
-                    if (elem.type == LLAMA_GRETYPE_RULE_REF) {
-                        // Ensure that the rule at that location exists
-                        if (elem.value >= state.rules.size() || state.rules[elem.value].empty()) {
-                            // Get the name of the rule that is missing
-                            for (const auto & kv : state.symbol_ids) {
-                                if (kv.second == elem.value) {
-                                    throw std::runtime_error("Undefined rule identifier '" + kv.first + "'");
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-            return state;
-        } catch (const std::exception & err) {
-            fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
-            return parse_state();
-        }
-    }
-
-    static void print_grammar_char(FILE * file, uint32_t c) {
-        if (0x20 <= c && c <= 0x7f) {
-            fprintf(file, "%c", static_cast<char>(c));
-        } else {
-            // cop out of encoding UTF-8
-            fprintf(file, "<U+%04X>", c);
-        }
-    }
-
-    static bool is_char_element(llama_grammar_element elem) {
-        switch (elem.type) {
-            case LLAMA_GRETYPE_CHAR:           return true;
-            case LLAMA_GRETYPE_CHAR_NOT:       return true;
-            case LLAMA_GRETYPE_CHAR_ALT:       return true;
-            case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
-            case LLAMA_GRETYPE_CHAR_ANY:       return true;
-            default:                           return false;
-        }
-    }
-
-    static void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
-        for (auto elem : rule) {
-            switch (elem.type) {
-                case LLAMA_GRETYPE_END:            fprintf(file, "END");            break;
-                case LLAMA_GRETYPE_ALT:            fprintf(file, "ALT");            break;
-                case LLAMA_GRETYPE_RULE_REF:       fprintf(file, "RULE_REF");       break;
-                case LLAMA_GRETYPE_CHAR:           fprintf(file, "CHAR");           break;
-                case LLAMA_GRETYPE_CHAR_NOT:       fprintf(file, "CHAR_NOT");       break;
-                case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
-                case LLAMA_GRETYPE_CHAR_ALT:       fprintf(file, "CHAR_ALT");       break;
-                case LLAMA_GRETYPE_CHAR_ANY:       fprintf(file, "CHAR_ANY");       break;
-            }
-            switch (elem.type) {
-                case LLAMA_GRETYPE_END:
-                case LLAMA_GRETYPE_ALT:
-                case LLAMA_GRETYPE_RULE_REF:
-                    fprintf(file, "(%u) ", elem.value);
-                    break;
-                case LLAMA_GRETYPE_CHAR:
-                case LLAMA_GRETYPE_CHAR_NOT:
-                case LLAMA_GRETYPE_CHAR_RNG_UPPER:
-                case LLAMA_GRETYPE_CHAR_ALT:
-                case LLAMA_GRETYPE_CHAR_ANY:
-                    fprintf(file, "(\"");
-                    print_grammar_char(file, elem.value);
-                    fprintf(file, "\") ");
-                    break;
-            }
-        }
-        fprintf(file, "\n");
-    }
-
-    static void print_rule(
-            FILE * file,
-            uint32_t rule_id,
-            const std::vector<llama_grammar_element> & rule,
-            const std::map<uint32_t, std::string> & symbol_id_names) {
-        if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
-            throw std::runtime_error(
-                "malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
-        }
-        fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
-        for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
-            llama_grammar_element elem = rule[i];
-            switch (elem.type) {
-                case LLAMA_GRETYPE_END:
-                    throw std::runtime_error(
-                        "unexpected end of rule: " + std::to_string(rule_id) + "," +
-                        std::to_string(i));
-                case LLAMA_GRETYPE_ALT:
-                    fprintf(file, "| ");
-                    break;
-                case LLAMA_GRETYPE_RULE_REF:
-                    fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
-                    break;
-                case LLAMA_GRETYPE_CHAR:
-                    fprintf(file, "[");
-                    print_grammar_char(file, elem.value);
-                    break;
-                case LLAMA_GRETYPE_CHAR_NOT:
-                    fprintf(file, "[^");
-                    print_grammar_char(file, elem.value);
-                    break;
-                case LLAMA_GRETYPE_CHAR_RNG_UPPER:
-                    if (i == 0 || !is_char_element(rule[i - 1])) {
-                        throw std::runtime_error(
-                            "LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
-                            std::to_string(rule_id) + "," + std::to_string(i));
-                    }
-                    fprintf(file, "-");
-                    print_grammar_char(file, elem.value);
-                    break;
-                case LLAMA_GRETYPE_CHAR_ALT:
-                    if (i == 0 || !is_char_element(rule[i - 1])) {
-                        throw std::runtime_error(
-                            "LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
-                            std::to_string(rule_id) + "," + std::to_string(i));
-                    }
-                    print_grammar_char(file, elem.value);
-                    break;
|
|
||||||
case LLAMA_GRETYPE_CHAR_ANY:
|
|
||||||
fprintf(file, ".");
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (is_char_element(elem)) {
|
|
||||||
switch (rule[i + 1].type) {
|
|
||||||
case LLAMA_GRETYPE_CHAR_ALT:
|
|
||||||
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
|
|
||||||
case LLAMA_GRETYPE_CHAR_ANY:
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
fprintf(file, "] ");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fprintf(file, "\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
void print_grammar(FILE * file, const parse_state & state) {
|
|
||||||
try {
|
|
||||||
std::map<uint32_t, std::string> symbol_id_names;
|
|
||||||
for (const auto & kv : state.symbol_ids) {
|
|
||||||
symbol_id_names[kv.second] = kv.first;
|
|
||||||
}
|
|
||||||
for (size_t i = 0, end = state.rules.size(); i < end; i++) {
|
|
||||||
// fprintf(file, "%zu: ", i);
|
|
||||||
// print_rule_binary(file, state.rules[i]);
|
|
||||||
print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
|
|
||||||
// fprintf(file, "\n");
|
|
||||||
}
|
|
||||||
} catch (const std::exception & err) {
|
|
||||||
fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<const llama_grammar_element *> parse_state::c_rules() {
|
|
||||||
std::vector<const llama_grammar_element *> ret;
|
|
||||||
ret.reserve(rules.size());
|
|
||||||
for (const auto & rule : rules) {
|
|
||||||
ret.push_back(rule.data());
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
}
|
|
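As a rough illustration (not part of the diff) of the flat encoding that parse_alternates() builds for a rule such as "root ::= [a] | [b]": each alternative is emitted by parse_sequence(), alternatives are separated by an ALT element, and the rule is closed with an END element before add_rule() stores it. The enum and struct below are local stand-ins that mirror the LLAMA_GRETYPE_* values used above; the real definitions live in llama.h.

// standalone sketch of the element layout produced for:  root ::= [a] | [b]
#include <cstdio>
#include <vector>

enum gretype_sketch { END = 0, ALT = 1, RULE_REF = 2, CHAR = 3 };

struct elem_sketch { gretype_sketch type; unsigned value; };

int main() {
    std::vector<elem_sketch> rule = {
        { CHAR, 'a' },   // first alternative:  [a]
        { ALT,  0   },   // '|' consumed by parse_alternates()
        { CHAR, 'b' },   // second alternative: [b]
        { END,  0   },   // terminator appended before add_rule()
    };
    for (const auto & e : rule) {
        std::printf("type=%d value=%u\n", e.type, e.value);
    }
    return 0;
}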
common/grammar-parser.h
@ -1,29 +0,0 @@
// Implements a parser for an extended Backus-Naur form (BNF), producing the
// binary context-free grammar format specified by llama.h. Supports character
// ranges, grouping, and repetition operators. As an example, a grammar for
// arithmetic might look like:
//
// root  ::= expr
// expr  ::= term ([-+*/] term)*
// term  ::= num | "(" space expr ")" space
// num   ::= [0-9]+ space
// space ::= [ \t\n]*

#pragma once
#include "llama.h"
#include <vector>
#include <map>
#include <cstdint>
#include <string>

namespace grammar_parser {
    struct parse_state {
        std::map<std::string, uint32_t>                 symbol_ids;
        std::vector<std::vector<llama_grammar_element>> rules;

        std::vector<const llama_grammar_element *> c_rules();
    };

    parse_state parse(const char * src);
    void print_grammar(FILE * file, const parse_state & state);
}
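A minimal usage sketch (not part of the diff) of how the grammar_parser API removed above used to be driven, assuming it is compiled inside the llama.cpp tree where grammar-parser.h and llama.h are available. The small GBNF string is just an example.

#include "grammar-parser.h"

#include <cstdio>

int main() {
    const char * gbnf =
        "root  ::= num ([-+*/] num)*\n"
        "num   ::= [0-9]+ space\n"
        "space ::= [ \\t\\n]*\n";

    grammar_parser::parse_state state = grammar_parser::parse(gbnf);
    if (state.rules.empty()) {
        std::fprintf(stderr, "grammar failed to parse\n");
        return 1;
    }

    // round-trip the parsed rules back to text for inspection
    grammar_parser::print_grammar(stdout, state);

    // c_rules() exposes the rules as raw pointers, e.g. for llama_grammar_init()
    auto rules = state.c_rules();
    std::fprintf(stderr, "parsed %zu rules\n", rules.size());
    return 0;
}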
401  common/log.cpp  Normal file
@ -0,0 +1,401 @@
#include "log.h"

#include <condition_variable>
#include <cstdarg>
#include <cstdio>
#include <mutex>
#include <sstream>
#include <thread>
#include <vector>

int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;

void gpt_log_set_verbosity_thold(int verbosity) {
    gpt_log_verbosity_thold = verbosity;
}

#define LOG_COL_DEFAULT "\033[0m"
#define LOG_COL_BOLD    "\033[1m"
#define LOG_COL_RED     "\033[31m"
#define LOG_COL_GREEN   "\033[32m"
#define LOG_COL_YELLOW  "\033[33m"
#define LOG_COL_BLUE    "\033[34m"
#define LOG_COL_MAGENTA "\033[35m"
#define LOG_COL_CYAN    "\033[36m"
#define LOG_COL_WHITE   "\033[37m"

static int64_t t_us() {
    return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
}

// colors
enum gpt_log_col : int {
    GPT_LOG_COL_DEFAULT = 0,
    GPT_LOG_COL_BOLD,
    GPT_LOG_COL_RED,
    GPT_LOG_COL_GREEN,
    GPT_LOG_COL_YELLOW,
    GPT_LOG_COL_BLUE,
    GPT_LOG_COL_MAGENTA,
    GPT_LOG_COL_CYAN,
    GPT_LOG_COL_WHITE,
};

// disable colors by default
static std::vector<const char *> g_col = {
    "",
    "",
    "",
    "",
    "",
    "",
    "",
    "",
    "",
};

struct gpt_log_entry {
    enum ggml_log_level level;

    bool prefix;

    int64_t timestamp;

    std::vector<char> msg;

    // signals the worker thread to stop
    bool is_end;

    void print(FILE * file = nullptr) const {
        FILE * fcur = file;
        if (!fcur) {
            // stderr displays DBG messages only when their verbosity level is not higher than the threshold
            // these messages will still be logged to a file
            if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
                return;
            }

            fcur = stdout;

            if (level != GGML_LOG_LEVEL_NONE) {
                fcur = stderr;
            }
        }

        if (level != GGML_LOG_LEVEL_NONE && prefix) {
            if (timestamp) {
                // [M.s.ms.us]
                fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
                        g_col[GPT_LOG_COL_BLUE],
                        (int) (timestamp / 1000000 / 60),
                        (int) (timestamp / 1000000 % 60),
                        (int) (timestamp / 1000 % 1000),
                        (int) (timestamp % 1000),
                        g_col[GPT_LOG_COL_DEFAULT]);
            }

            switch (level) {
                case GGML_LOG_LEVEL_INFO:  fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN],   g_col[GPT_LOG_COL_DEFAULT]); break;
                case GGML_LOG_LEVEL_WARN:  fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], ""                        ); break;
                case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED],     ""                        ); break;
                case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW],  ""                        ); break;
                default:
                    break;
            }
        }

        fprintf(fcur, "%s", msg.data());

        if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) {
            fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
        }

        fflush(fcur);
    }
};

struct gpt_log {
    // default capacity - will be expanded if needed
    gpt_log() : gpt_log(256) {}

    gpt_log(size_t capacity) {
        file = nullptr;
        prefix = false;
        timestamps = false;
        running = false;
        t_start = t_us();

        // initial message size - will be expanded if longer messages arrive
        entries.resize(capacity);
        for (auto & entry : entries) {
            entry.msg.resize(256);
        }

        head = 0;
        tail = 0;

        resume();
    }

    ~gpt_log() {
        pause();
        if (file) {
            fclose(file);
        }
    }

private:
    std::mutex mtx;
    std::thread thrd;
    std::condition_variable cv;

    FILE * file;

    bool prefix;
    bool timestamps;
    bool running;

    int64_t t_start;

    // ring buffer of entries
    std::vector<gpt_log_entry> entries;
    size_t head;
    size_t tail;

    // worker thread copies into this
    gpt_log_entry cur;

public:
    void add(enum ggml_log_level level, const char * fmt, va_list args) {
        std::lock_guard<std::mutex> lock(mtx);

        if (!running) {
            // discard messages while the worker thread is paused
            return;
        }

        auto & entry = entries[tail];

        {
            // cannot use args twice, so make a copy in case we need to expand the buffer
            va_list args_copy;
            va_copy(args_copy, args);

#if 1
            const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args);
            if (n >= entry.msg.size()) {
                entry.msg.resize(n + 1);
                vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy);
            }
#else
            // hack for bolding arguments

            std::stringstream ss;
            for (int i = 0; fmt[i] != 0; i++) {
                if (fmt[i] == '%') {
                    ss << LOG_COL_BOLD;
                    while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++];
                    ss << LOG_COL_DEFAULT;
                    if (fmt[i] == 0) break;
                }
                ss << fmt[i];
            }
            const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args);
            if (n >= entry.msg.size()) {
                entry.msg.resize(n + 1);
                vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
            }
#endif
        }

        entry.level = level;
        entry.prefix = prefix;
        entry.timestamp = 0;
        if (timestamps) {
            entry.timestamp = t_us() - t_start;
        }
        entry.is_end = false;

        tail = (tail + 1) % entries.size();
        if (tail == head) {
            // expand the buffer
            std::vector<gpt_log_entry> new_entries(2*entries.size());

            size_t new_tail = 0;

            do {
                new_entries[new_tail] = std::move(entries[head]);

                head     = (head     + 1) % entries.size();
                new_tail = (new_tail + 1);
            } while (head != tail);

            head = 0;
            tail = new_tail;

            for (size_t i = tail; i < new_entries.size(); i++) {
                new_entries[i].msg.resize(256);
            }

            entries = std::move(new_entries);
        }

        cv.notify_one();
    }

    void resume() {
        std::lock_guard<std::mutex> lock(mtx);

        if (running) {
            return;
        }

        running = true;

        thrd = std::thread([this]() {
            while (true) {
                {
                    std::unique_lock<std::mutex> lock(mtx);
                    cv.wait(lock, [this]() { return head != tail; });

                    cur = entries[head];

                    head = (head + 1) % entries.size();
                }

                if (cur.is_end) {
                    break;
                }

                cur.print(); // stdout and stderr

                if (file) {
                    cur.print(file);
                }
            }
        });
    }

    void pause() {
        {
            std::lock_guard<std::mutex> lock(mtx);

            if (!running) {
                return;
            }

            running = false;

            // push an entry to signal the worker thread to stop
            {
                auto & entry = entries[tail];
                entry.is_end = true;

                tail = (tail + 1) % entries.size();
            }

            cv.notify_one();
        }

        thrd.join();
    }

    void set_file(const char * path) {
        pause();

        if (file) {
            fclose(file);
        }

        if (path) {
            file = fopen(path, "w");
        } else {
            file = nullptr;
        }

        resume();
    }

    void set_colors(bool colors) {
        pause();

        if (colors) {
            g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
            g_col[GPT_LOG_COL_BOLD]    = LOG_COL_BOLD;
            g_col[GPT_LOG_COL_RED]     = LOG_COL_RED;
            g_col[GPT_LOG_COL_GREEN]   = LOG_COL_GREEN;
            g_col[GPT_LOG_COL_YELLOW]  = LOG_COL_YELLOW;
            g_col[GPT_LOG_COL_BLUE]    = LOG_COL_BLUE;
            g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
            g_col[GPT_LOG_COL_CYAN]    = LOG_COL_CYAN;
            g_col[GPT_LOG_COL_WHITE]   = LOG_COL_WHITE;
        } else {
            for (size_t i = 0; i < g_col.size(); i++) {
                g_col[i] = "";
            }
        }

        resume();
    }

    void set_prefix(bool prefix) {
        std::lock_guard<std::mutex> lock(mtx);

        this->prefix = prefix;
    }

    void set_timestamps(bool timestamps) {
        std::lock_guard<std::mutex> lock(mtx);

        this->timestamps = timestamps;
    }
};

//
// public API
//

struct gpt_log * gpt_log_init() {
    return new gpt_log;
}

struct gpt_log * gpt_log_main() {
    static struct gpt_log log;

    return &log;
}

void gpt_log_pause(struct gpt_log * log) {
    log->pause();
}

void gpt_log_resume(struct gpt_log * log) {
    log->resume();
}

void gpt_log_free(struct gpt_log * log) {
    delete log;
}

void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    log->add(level, fmt, args);
    va_end(args);
}

void gpt_log_set_file(struct gpt_log * log, const char * file) {
    log->set_file(file);
}

void gpt_log_set_colors(struct gpt_log * log, bool colors) {
    log->set_colors(colors);
}

void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
    log->set_prefix(prefix);
}

void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
    log->set_timestamps(timestamps);
}
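A usage sketch (not part of the diff) for the public gpt_log API above, assuming the program is linked against common/log.cpp; "demo.log" is just an example path.

#include "log.h"

int main() {
    struct gpt_log * log = gpt_log_init();

    gpt_log_set_prefix    (log, true);        // print the "I/W/E/D" level prefix
    gpt_log_set_timestamps(log, true);        // prepend elapsed time to each line
    gpt_log_set_file      (log, "demo.log");  // also write every message to a file

    gpt_log_add(log, GGML_LOG_LEVEL_INFO,  "starting up, n_threads = %d\n", 8);
    gpt_log_add(log, GGML_LOG_LEVEL_WARN,  "this goes to stderr and to demo.log\n");
    // with the default threshold this DEBUG line is skipped on stderr but still written to the file
    gpt_log_add(log, GGML_LOG_LEVEL_DEBUG, "ring buffer capacity starts at 256 entries\n");

    // the destructor calls pause(), which joins the worker thread and closes the file
    gpt_log_free(log);
    return 0;
}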
786  common/log.h
@ -1,724 +1,90 @@
#pragma once

#include "ggml.h" // for ggml_log_level

#ifndef __GNUC__
#    define LOG_ATTRIBUTE_FORMAT(...)
#elif defined(__MINGW32__)
#    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
#    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif

#define LOG_DEFAULT_DEBUG 1
#define LOG_DEFAULT_LLAMA 0

// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity is lower than the threshold
// set via gpt_log_set_verbosity_thold()
extern int gpt_log_verbosity_thold;

void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe

// the gpt_log uses an internal worker thread to print/write log messages
// when the worker thread is paused, incoming log messages are discarded
struct gpt_log;

struct gpt_log * gpt_log_init();
struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit
void             gpt_log_pause (struct gpt_log * log); // pause  the worker thread, not thread-safe
void             gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe
void             gpt_log_free  (struct gpt_log * log);

LOG_ATTRIBUTE_FORMAT(3, 4)
void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...);

// defaults: file = NULL, colors = false, prefix = false, timestamps = false
//
// regular log output:
//
//   ggml_backend_metal_log_allocated_size: allocated buffer, size =  6695.84 MiB, ( 6695.91 / 21845.34)
//   llm_load_tensors: ggml ctx size =    0.27 MiB
//   llm_load_tensors: offloading 32 repeating layers to GPU
//   llm_load_tensors: offloading non-repeating layers to GPU
//
// with prefix = true, timestamps = true, the log output will look like this:
//
//   0.00.035.060 D ggml_backend_metal_log_allocated_size: allocated buffer, size =  6695.84 MiB, ( 6695.91 / 21845.34)
//   0.00.035.064 I llm_load_tensors: ggml ctx size =    0.27 MiB
//   0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
//   0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
//
// I - info    (stdout, V = 0)
// W - warning (stderr, V = 0)
// E - error   (stderr, V = 0)
// D - debug   (stderr, V = LOG_DEFAULT_DEBUG)
//

void gpt_log_set_file      (struct gpt_log * log, const char * file);       // not thread-safe
void gpt_log_set_colors    (struct gpt_log * log,       bool   colors);     // not thread-safe
void gpt_log_set_prefix    (struct gpt_log * log,       bool   prefix);     // whether to output prefix to each log
void gpt_log_set_timestamps(struct gpt_log * log,       bool   timestamps); // whether to output timestamps in the prefix

// helper macros for logging
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
//
// for example:
//
//   LOG_DBG("this is a debug message: %d\n", expensive_function());
//
// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold
//

#define LOG_TMPL(level, verbosity, ...) \
    do { \
        if ((verbosity) <= gpt_log_verbosity_thold) { \
            gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \
        } \
    } while (0)

#define LOG(...)             LOG_TMPL(GGML_LOG_LEVEL_NONE, 0,         __VA_ARGS__)
#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)

#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  0,                 __VA_ARGS__)
#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  0,                 __VA_ARGS__)
#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0,                 __VA_ARGS__)
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)

#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  verbosity, __VA_ARGS__)
#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  verbosity, __VA_ARGS__)
#define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
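A sketch (not part of the diff) of how the macros above are meant to be used, assuming common/log.cpp is linked in; expensive_check() and the model filename are made-up placeholders.

#include "log.h"

static int expensive_check() {
    // stands in for work that should be skipped when debug logging is off
    return 42;
}

int main() {
    gpt_log_set_verbosity_thold(LOG_DEFAULT_LLAMA); // 0: INF/WRN/ERR only

    LOG_INF("%s: loading model '%s'\n", __func__, "model.gguf");
    LOG_WRN("%s: falling back to CPU\n", __func__);

    // LOG_DBG expands to LOG_TMPL(..., LOG_DEFAULT_DEBUG, ...); with the threshold
    // at 0 the condition fails and expensive_check() is never evaluated
    LOG_DBG("%s: debug value = %d\n", __func__, expensive_check());

    gpt_log_set_verbosity_thold(LOG_DEFAULT_DEBUG); // now debug messages pass the check
    LOG_DBG("%s: debug value = %d\n", __func__, expensive_check());

    return 0;
}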
common/ngram-cache.cpp
@ -2,8 +2,11 @@
#include "common.h"
#include "log.h"

#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <thread>

void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
                              std::vector<llama_token> & inp, int nnew, bool print_progress) {
@ -1,460 +1,450 @@
|
|||||||
#define LLAMA_API_INTERNAL
|
|
||||||
#include "sampling.h"
|
#include "sampling.h"
|
||||||
#include <random>
|
|
||||||
|
|
||||||
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
|
#include "common.h"
|
||||||
struct llama_sampling_context * result = new llama_sampling_context();
|
|
||||||
|
|
||||||
result->params = params;
|
#include <cmath>
|
||||||
result->grammar = nullptr;
|
#include <unordered_map>
|
||||||
|
|
||||||
// if there is a grammar, parse it
|
// the ring buffer works similarly to std::deque, but with a fixed capacity
|
||||||
if (!params.grammar.empty()) {
|
// TODO: deduplicate with llama-impl.h
|
||||||
result->parsed_grammar = grammar_parser::parse(params.grammar.c_str());
|
template<typename T>
|
||||||
|
struct ring_buffer {
|
||||||
|
ring_buffer(size_t cap) : capacity(cap), data(cap) {}
|
||||||
|
|
||||||
// will be empty (default) if there are parse errors
|
T & front() {
|
||||||
if (result->parsed_grammar.rules.empty()) {
|
if (sz == 0) {
|
||||||
fprintf(stderr, "%s: failed to parse grammar\n", __func__);
|
throw std::runtime_error("ring buffer is empty");
|
||||||
delete result;
|
}
|
||||||
return nullptr;
|
return data[first];
|
||||||
|
}
|
||||||
|
|
||||||
|
const T & front() const {
|
||||||
|
if (sz == 0) {
|
||||||
|
throw std::runtime_error("ring buffer is empty");
|
||||||
|
}
|
||||||
|
return data[first];
|
||||||
|
}
|
||||||
|
|
||||||
|
T & back() {
|
||||||
|
if (sz == 0) {
|
||||||
|
throw std::runtime_error("ring buffer is empty");
|
||||||
|
}
|
||||||
|
return data[pos];
|
||||||
|
}
|
||||||
|
|
||||||
|
const T & back() const {
|
||||||
|
if (sz == 0) {
|
||||||
|
throw std::runtime_error("ring buffer is empty");
|
||||||
|
}
|
||||||
|
return data[pos];
|
||||||
|
}
|
||||||
|
|
||||||
|
void push_back(const T & value) {
|
||||||
|
if (sz == capacity) {
|
||||||
|
// advance the start when buffer is full
|
||||||
|
first = (first + 1) % capacity;
|
||||||
|
} else {
|
||||||
|
sz++;
|
||||||
|
}
|
||||||
|
data[pos] = value;
|
||||||
|
pos = (pos + 1) % capacity;
|
||||||
|
}
|
||||||
|
|
||||||
|
T pop_front() {
|
||||||
|
if (sz == 0) {
|
||||||
|
throw std::runtime_error("ring buffer is empty");
|
||||||
|
}
|
||||||
|
T value = data[first];
|
||||||
|
first = (first + 1) % capacity;
|
||||||
|
sz--;
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
const T & rat(size_t i) const {
|
||||||
|
if (i >= sz) {
|
||||||
|
throw std::runtime_error("ring buffer: index out of bounds");
|
||||||
|
}
|
||||||
|
return data[(first + sz - i - 1) % capacity];
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<T> to_vector() const {
|
||||||
|
std::vector<T> result;
|
||||||
|
result.reserve(sz);
|
||||||
|
for (size_t i = 0; i < sz; i++) {
|
||||||
|
result.push_back(data[(first + i) % capacity]);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
void clear() {
|
||||||
|
// here only reset the status of the buffer
|
||||||
|
sz = 0;
|
||||||
|
first = 0;
|
||||||
|
pos = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool empty() const {
|
||||||
|
return sz == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t size() const {
|
||||||
|
return sz;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t capacity = 0;
|
||||||
|
size_t sz = 0;
|
||||||
|
size_t first = 0;
|
||||||
|
size_t pos = 0;
|
||||||
|
std::vector<T> data;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct gpt_sampler {
|
||||||
|
gpt_sampler_params params;
|
||||||
|
|
||||||
|
struct llama_sampler * grmr;
|
||||||
|
struct llama_sampler * chain;
|
||||||
|
|
||||||
|
ring_buffer<llama_token> prev;
|
||||||
|
|
||||||
|
std::vector<llama_token_data> cur;
|
||||||
|
|
||||||
|
llama_token_data_array cur_p;
|
||||||
|
|
||||||
|
void set_logits(struct llama_context * ctx, int idx) {
|
||||||
|
const auto * logits = llama_get_logits_ith(ctx, idx);
|
||||||
|
|
||||||
|
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||||
|
|
||||||
|
cur.resize(n_vocab);
|
||||||
|
|
||||||
|
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||||
|
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ensure that there is a "root" node.
|
cur_p = { cur.data(), cur.size(), -1, false };
|
||||||
if (result->parsed_grammar.symbol_ids.find("root") == result->parsed_grammar.symbol_ids.end()) {
|
|
||||||
fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
|
|
||||||
delete result;
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
|
|
||||||
|
|
||||||
struct llama_grammar * grammar = llama_grammar_init(
|
|
||||||
grammar_rules.data(),
|
|
||||||
grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
|
|
||||||
if (grammar == nullptr) {
|
|
||||||
throw std::runtime_error("Failed to initialize llama_grammar");
|
|
||||||
}
|
|
||||||
result->grammar = grammar;
|
|
||||||
}
|
}
|
||||||
|
};
|
||||||
|
|
||||||
std::string gpt_sampler_params::print() const {
    char result[1024];

    snprintf(result, sizeof(result),
            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
            penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
            top_k, tfs_z, top_p, min_p, typ_p, temp,
            mirostat, mirostat_eta, mirostat_tau);

    return std::string(result);
}
struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

    lparams.no_perf = params.no_perf;

    auto * result = new gpt_sampler {
        /* .params = */ params,
        /* .grmr   = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
        /* .chain  = */ llama_sampler_chain_init(lparams),
        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
        /* .cur    = */ {},
        /* .cur_p  = */ {},
    };

    llama_sampler_chain_add(result->chain,
            llama_sampler_init_logit_bias(
                llama_n_vocab(model),
                params.logit_bias.size(),
                params.logit_bias.data()));

    llama_sampler_chain_add(result->chain,
            llama_sampler_init_penalties(
                llama_n_vocab  (model),
                llama_token_eos(model),
                llama_token_nl (model),
                params.penalty_last_n,
                params.penalty_repeat,
                params.penalty_freq,
                params.penalty_present,
                params.penalize_nl,
                params.ignore_eos));

    if (params.temp > 0.0f) {
        if (params.mirostat == 0) {
            for (const auto & cnstr : params.samplers) {
                switch (cnstr) {
                    case GPT_SAMPLER_TYPE_TOP_K:
                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
                        break;
                    case GPT_SAMPLER_TYPE_TOP_P:
                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p    (params.top_p, params.min_keep));
                        break;
                    case GPT_SAMPLER_TYPE_MIN_P:
                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p    (params.min_p, params.min_keep));
                        break;
                    case GPT_SAMPLER_TYPE_TFS_Z:
                        llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
                        break;
                    case GPT_SAMPLER_TYPE_TYPICAL_P:
                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical  (params.typ_p, params.min_keep));
                        break;
                    case GPT_SAMPLER_TYPE_TEMPERATURE:
                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                        break;
                    default:
                        GGML_ASSERT(false && "unknown sampler type");
                }
            }
            llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
            llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
        } else if (params.mirostat == 1) {
            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
        } else if (params.mirostat == 2) {
            llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
            llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
        } else {
            GGML_ASSERT(false && "unknown mirostat version");
        }
    } else {
        llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
        llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
    }

    return result;
}
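As a quick sanity check of the chain constructed above, something like the following could be used. This is a hedged sketch, not code from the commit: it assumes gpt_sampler_params can be default-constructed with its usual defaults, that the declarations above are in scope, and that the sampler names shown in the comment match what llama_sampler_name() reports.

// Sketch only (not from the diff): build a sampler from default parameters and
// inspect the resulting chain with gpt_sampler_print() (defined later in this file).
// Assumes `model` was loaded elsewhere and that <cstdio> is available.
static void debug_print_default_chain(const llama_model * model) {
    gpt_sampler_params sparams; // default-initialized fields (assumed)

    struct gpt_sampler * smpl = gpt_sampler_init(model, sparams);

    // with temp > 0 and mirostat == 0 this prints something like:
    //   "logits -> logit-bias -> penalties -> top-k -> tail-free -> typical -> top-p -> min-p -> temp-ext -> softmax -> dist"
    fprintf(stderr, "%s\n", gpt_sampler_print(smpl).c_str());

    gpt_sampler_free(smpl);
}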
void gpt_sampler_free(struct gpt_sampler * gsmpl) {
    if (gsmpl) {
        llama_sampler_free(gsmpl->grmr);

        llama_sampler_free(gsmpl->chain);

        delete gsmpl;
    }
}

void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar) {
    if (accept_grammar) {
        llama_sampler_accept(gsmpl->grmr, token);
    }

    llama_sampler_accept(gsmpl->chain, token);

    gsmpl->prev.push_back(token);
}

void gpt_sampler_reset(struct gpt_sampler * gsmpl) {
    llama_sampler_reset(gsmpl->grmr);

    llama_sampler_reset(gsmpl->chain);
}

struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
    return new gpt_sampler {
        /* .params = */ gsmpl->params,
        /* .grmr   = */ llama_sampler_clone(gsmpl->grmr),
        /* .chain  = */ llama_sampler_clone(gsmpl->chain),
        /* .prev   = */ gsmpl->prev,
        /* .cur    = */ gsmpl->cur,
        /* .cur_p  = */ gsmpl->cur_p,
    };
}

void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
    // TODO: measure grammar performance

    if (gsmpl) {
        llama_perf_sampler_print(gsmpl->chain);
    }
    if (ctx) {
        llama_perf_context_print(ctx);
    }
}

llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
    gsmpl->set_logits(ctx, idx);

    auto & grmr  = gsmpl->grmr;
    auto & chain = gsmpl->chain;
    auto & cur_p = gsmpl->cur_p; // initialized by set_logits

    if (grammar_first) {
        llama_sampler_apply(grmr, &cur_p);
    }

    llama_sampler_apply(chain, &cur_p);

    GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");

    const llama_token id = cur_p.data[cur_p.selected].id;

    if (grammar_first) {
        return id;
    }

    // check if the sampled token fits the grammar
    {
        llama_token_data       single_token_data       = { id, 1.0f, 0.0f };
        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };

        llama_sampler_apply(grmr, &single_token_data_array);

        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
        if (is_valid) {
            return id;
        }
    }

    // resampling:
    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
    gsmpl->set_logits(ctx, idx);

    llama_sampler_apply(grmr,  &cur_p);
    llama_sampler_apply(chain, &cur_p);

    GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");

    return cur_p.data[cur_p.selected].id;
}

uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
    return llama_sampler_get_seed(gsmpl->chain);
}

// helpers

llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
    return &gsmpl->cur_p;
}

llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
    return gsmpl->prev.rat(0);
}

std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
    std::string result = "logits ";

    for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
        const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
        result += std::string("-> ") + llama_sampler_name(smpl) + " ";
    }

    return result;
}

std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) {
    n = std::min(n, (int) gsmpl->prev.size());

    if (n <= 0) {
        return "";
    }

    std::string result;
    result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab

    for (int i = n - 1; i >= 0; i--) {
        const llama_token id = gsmpl->prev.rat(i);

        GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");

        result += llama_token_to_piece(ctx_main, id);
    }

    return result;
}

char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
    switch (cnstr) {
        case GPT_SAMPLER_TYPE_TOP_K:       return 'k';
        case GPT_SAMPLER_TYPE_TFS_Z:       return 'f';
        case GPT_SAMPLER_TYPE_TYPICAL_P:   return 'y';
        case GPT_SAMPLER_TYPE_TOP_P:       return 'p';
        case GPT_SAMPLER_TYPE_MIN_P:       return 'm';
        case GPT_SAMPLER_TYPE_TEMPERATURE: return 't';
        default : return '?';
    }
}

std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) {
    switch (cnstr) {
        case GPT_SAMPLER_TYPE_TOP_K:       return "top_k";
        case GPT_SAMPLER_TYPE_TFS_Z:       return "tfs_z";
        case GPT_SAMPLER_TYPE_TYPICAL_P:   return "typ_p";
        case GPT_SAMPLER_TYPE_TOP_P:       return "top_p";
        case GPT_SAMPLER_TYPE_MIN_P:       return "min_p";
        case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature";
        default : return "";
    }
}
std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
    std::unordered_map<std::string, gpt_sampler_type> sampler_canonical_name_map {
        { "top_k",       GPT_SAMPLER_TYPE_TOP_K },
        { "top_p",       GPT_SAMPLER_TYPE_TOP_P },
        { "typ_p",       GPT_SAMPLER_TYPE_TYPICAL_P },
        { "min_p",       GPT_SAMPLER_TYPE_MIN_P },
        { "tfs_z",       GPT_SAMPLER_TYPE_TFS_Z },
        { "temperature", GPT_SAMPLER_TYPE_TEMPERATURE },
    };

    // since samplers names are written multiple ways
    // make it ready for both system names and input names
    std::unordered_map<std::string, gpt_sampler_type> sampler_alt_name_map {
        { "top-k",       GPT_SAMPLER_TYPE_TOP_K },
        { "top-p",       GPT_SAMPLER_TYPE_TOP_P },
        { "nucleus",     GPT_SAMPLER_TYPE_TOP_P },
        { "typical-p",   GPT_SAMPLER_TYPE_TYPICAL_P },
        { "typical",     GPT_SAMPLER_TYPE_TYPICAL_P },
        { "typ-p",       GPT_SAMPLER_TYPE_TYPICAL_P },
        { "typ",         GPT_SAMPLER_TYPE_TYPICAL_P },
        { "min-p",       GPT_SAMPLER_TYPE_MIN_P },
        { "tfs-z",       GPT_SAMPLER_TYPE_TFS_Z },
        { "tfs",         GPT_SAMPLER_TYPE_TFS_Z },
        { "temp",        GPT_SAMPLER_TYPE_TEMPERATURE },
    };

    std::vector<gpt_sampler_type> samplers;
    samplers.reserve(names.size());

    for (const auto & name : names) {
        auto sampler = sampler_canonical_name_map.find(name);
        if (sampler != sampler_canonical_name_map.end()) {
            samplers.push_back(sampler->second);
        } else {
            if (allow_alt_names) {
                sampler = sampler_alt_name_map.find(name);
                if (sampler != sampler_alt_name_map.end()) {
                    samplers.push_back(sampler->second);
                }
            }
        }
    }

    return samplers;
}

std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
    std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K),       GPT_SAMPLER_TYPE_TOP_K },
        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z),       GPT_SAMPLER_TYPE_TFS_Z },
        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P),   GPT_SAMPLER_TYPE_TYPICAL_P },
        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P),       GPT_SAMPLER_TYPE_TOP_P },
        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P),       GPT_SAMPLER_TYPE_MIN_P },
        { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE }
    };

    std::vector<gpt_sampler_type> samplers;
    samplers.reserve(chars.size());

    for (const auto & c : chars) {
        const auto sampler = sampler_name_map.find(c);
        if (sampler != sampler_name_map.end()) {
            samplers.push_back(sampler->second);
        }
    }

    return samplers;
}
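For orientation, here is a small usage sketch of the two parsers above; it is illustrative only and not part of the commit. The character codes and the samplers field follow the mappings visible in the code above; the wrapper function and its name are assumptions, and the declarations above are assumed to be in scope.

// Sketch only (not from the diff): turn a user-supplied sampler-order string into
// the samplers vector consumed by gpt_sampler_init() above.
static void set_sampler_order(gpt_sampler_params & sparams, const std::string & order) {
    // single-character codes, as returned by gpt_sampler_type_to_chr():
    //   'k' top_k, 'f' tfs_z, 'y' typ_p, 'p' top_p, 'm' min_p, 't' temperature
    sparams.samplers = gpt_sampler_types_from_chars(order); // e.g. "kfypmt"

    // the long-name parser also accepts aliases ("top-k", "nucleus", "typ", ...) when allow_alt_names is true:
    // sparams.samplers = gpt_sampler_types_from_names({"top_k", "min_p", "temperature"}, /*allow_alt_names=*/true);
}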
@@ -2,159 +2,82 @@ (common/sampling.h)

The old header contents are removed: the grammar-parser include, the llama_sampler_type enum, the llama_sampling_params struct (n_prev, n_probs, min_keep, top_k, top_p, min_p, tfs_z, typical_p, temp, dynatemp_range, dynatemp_exponent, penalty_last_n, penalty_repeat, penalty_freq, penalty_present, mirostat, mirostat_tau, mirostat_eta, penalize_nl, seed, samplers_sequence, grammar, cfg_negative_prompt, cfg_scale, logit_bias, penalty_prompt_tokens, use_penalty_prompt_tokens), the llama_sampling_context struct (params, mirostat_mu, grammar, parsed_grammar, prev, cur, n_valid, rng) and the llama_sampling_* function declarations. They are replaced with the gpt_sampler interface:

#include "llama.h"

#include "common.h"

#include <string>
#include <vector>

// gpt_sampler extends llama_sampler with additional functionality:
//
// - grammar support
// - custom sampler logic based on the parameters
// - history of the last accepted tokens
// - performance metrics
//
// The goal is to have a common implementation of the sampling logic shared across the examples.
// For example, depending on the temperature, the sampling chain can be very simple (greedy) or more
// complex (top-k, top-p, etc).
//
// Another example is related to the grammar. In general, the grammar constraints applied on the full
// vocabulary can be very taxing. To improve performance, the grammar can be applied only to the sampled
// token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
// grammar constraints are applied to the full vocabulary and the token is resampled.
//
// The gpt_sampler also maintains a container with the last accepted tokens. In the future, this can
// be moved into the core llama library.
//
// For convenience, the gpt_sampler also maintains a container with the current candidate tokens.
// This can be used to access the probabilities of the rest of the non-sampled tokens.
//
// TODO: measure grammar performance
//

struct gpt_sampler;

// llama_sampler API overloads

struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params);

void gpt_sampler_free(struct gpt_sampler * gsmpl);

// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
void                 gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar);
void                 gpt_sampler_reset (struct gpt_sampler * gsmpl);
struct gpt_sampler * gpt_sampler_clone (struct gpt_sampler * gsmpl);

// arguments can be nullptr to skip printing
void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);

// extended sampling implementation:
//
// - set logits
// - apply the configured sampler chain
// - check if the token fits the grammar (if any)
// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
//
// if grammar_first is true, the grammar is applied before the samplers (slower)
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
//
llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);

uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);

// helpers

// access the internal list of current candidate tokens
llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl);

// get the last accepted token
llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl);

// print the sampler chain into a string
std::string gpt_sampler_print(const struct gpt_sampler * gsmpl);

// get a string representation of the last accepted tokens
std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n);

char        gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr);
std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr);

std::vector<enum gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<enum gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars);
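Taken together, the API above is meant to be used roughly as follows. This is a hedged sketch rather than code from the commit: the llama.h entry points (llama_batch_get_one, llama_decode, llama_token_is_eog), the llama_tokenize / llama_token_to_piece common helpers, the header names and the default-constructed gpt_sampler_params are assumptions about the surrounding API at this point in the tree; only the gpt_sampler_* calls are taken from the declarations above.

// Sketch only: a minimal generation loop on top of the gpt_sampler API declared above.
#include "common.h"    // llama_tokenize / llama_token_to_piece helpers (assumed location)
#include "sampling.h"  // the gpt_sampler API above (assumed header name)

#include <cstdio>
#include <string>
#include <vector>

static void generate(llama_model * model, llama_context * ctx, const std::string & prompt, int n_predict) {
    gpt_sampler_params sparams; // default sampling parameters (assumed defaults)

    struct gpt_sampler * smpl = gpt_sampler_init(model, sparams);

    std::vector<llama_token> tokens = llama_tokenize(ctx, prompt, /*add_special=*/true);

    // evaluate the prompt in one batch
    llama_decode(ctx, llama_batch_get_one(tokens.data(), (int32_t) tokens.size(), 0, 0));

    for (int i = 0; i < n_predict; ++i) {
        // fast path: sample through the chain, then verify the token against the grammar (if any)
        llama_token id = gpt_sampler_sample(smpl, ctx, /*idx=*/-1, /*grammar_first=*/false);

        // record the token in the chain, the grammar and the history ring buffer
        gpt_sampler_accept(smpl, id, /*accept_grammar=*/true);

        if (llama_token_is_eog(model, id)) {
            break;
        }

        printf("%s", llama_token_to_piece(ctx, id).c_str());

        // feed the sampled token back for the next step
        llama_decode(ctx, llama_batch_get_one(&id, 1, (int32_t) tokens.size() + i, 0));
    }

    gpt_perf_print(ctx, smpl);
    gpt_sampler_free(smpl);
}

As with the old llama_sampling API, a caller handling multiple sequences would be expected to call gpt_sampler_reset() whenever a sequence ends.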
common/stb_image.h: 11662 changed lines (file diff suppressed because it is too large)
common/train.cpp
@@ -1,9 +1,11 @@
 #include "train.h"
 #include "common.h"
 
+#include <algorithm>
 #include <random>
 #include <sstream>
 #include <functional>
+#include <cstring>
 
 struct random_normal_distribution {
     std::mt19937 gen;
convert_hf_to_gguf.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
+import ast
 import logging
 import argparse
 import contextlib
@@ -63,6 +64,7 @@ class Model:
     model_name: str | None
     metadata_override: Path | None
     dir_model_card: Path
+    is_lora: bool
 
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH
@@ -70,7 +72,7 @@ class Model:
     def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
                  use_temp_file: bool = False, eager: bool = False,
                  metadata_override: Path | None = None, model_name: str | None = None,
-                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
+                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, is_lora: bool = False):
         if type(self) is Model:
             raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
 
@@ -92,6 +94,7 @@ class Model:
         self.metadata_override = metadata_override
         self.model_name = model_name
         self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py
+        self.is_lora = is_lora  # true if model is used inside convert_lora_to_gguf.py
 
         # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
         if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -129,12 +132,14 @@ class Model:
     def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
         tensor_names_from_parts: set[str] = set()
 
-        if len(self.part_names) > 1:
-            self.tensor_names = set()
-            index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
-            index_name += ".index.json"
+        index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
+        index_name += ".index.json"
+        index_file = self.dir_model / index_name
+
+        if index_file.is_file():
+            self.tensor_names = set()
             logger.info(f"gguf: loading model weight map from '{index_name}'")
-            with open(self.dir_model / index_name, "r", encoding="utf-8") as f:
+            with open(index_file, "r", encoding="utf-8") as f:
                 index: dict[str, Any] = json.load(f)
                 weight_map = index.get("weight_map")
                 if weight_map is None or not isinstance(weight_map, dict):
@@ -142,6 +147,7 @@ class Model:
                 self.tensor_names.update(weight_map.keys())
         else:
             self.tensor_names = tensor_names_from_parts
+            weight_map = {}
 
         for part_name in self.part_names:
             logger.info(f"gguf: loading model part '{part_name}'")
@@ -168,9 +174,17 @@ class Model:
                     data = LazyTorchTensor.from_eager(data)
                 yield name, data
 
-        # only verify tensor name presence; it doesn't matter if they are not in the right files
-        if len(sym_diff := tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
-            raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}")
+        # verify tensor name presence and identify potentially missing files
+        if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
+            missing = sorted(self.tensor_names.difference(tensor_names_from_parts))
+            extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
+            missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
+            if len(extra) == 0 and len(missing_files) > 0:
+                raise ValueError(f"Missing or incomplete model files: {missing_files}")
+            else:
+                raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
+                                 f"Missing tensors: {missing}\n"
+                                 f"Extra tensors: {extra}")
 
     def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
         if key not in gguf.MODEL_TENSORS[self.model_arch]:
@@ -296,12 +310,31 @@ class Model:
                         gguf.MODEL_TENSOR.POS_EMBD,
                         gguf.MODEL_TENSOR.TOKEN_TYPES,
                         gguf.MODEL_TENSOR.SSM_CONV1D,
+                        gguf.MODEL_TENSOR.TIME_MIX_FIRST,
+                        gguf.MODEL_TENSOR.TIME_MIX_W1,
+                        gguf.MODEL_TENSOR.TIME_MIX_W2,
+                        gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
+                        gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
                     )
                 )
-                or not name.endswith(".weight")
+                or not new_name.endswith(".weight")
             ):
                 data_qtype = gguf.GGMLQuantizationType.F32
 
+            if data_qtype is False and any(
+                self.match_model_tensor_name(new_name, key, bid)
+                for key in (
+                    gguf.MODEL_TENSOR.TOKEN_EMBD,
+                    gguf.MODEL_TENSOR.OUTPUT,
+                )
+            ):
+                if self.ftype in (
+                    gguf.LlamaFileType.MOSTLY_TQ1_0,
+                    gguf.LlamaFileType.MOSTLY_TQ2_0,
+                ):
+                    # TODO: use Q4_K and Q6_K
+                    data_qtype = gguf.GGMLQuantizationType.F16
+
             # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
             if isinstance(data_qtype, bool):
                 if self.ftype == gguf.LlamaFileType.ALL_F32:
@@ -312,6 +345,10 @@ class Model:
                     data_qtype = gguf.GGMLQuantizationType.BF16
                 elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
                     data_qtype = gguf.GGMLQuantizationType.Q8_0
+                elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0:
+                    data_qtype = gguf.GGMLQuantizationType.TQ1_0
+                elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0:
+                    data_qtype = gguf.GGMLQuantizationType.TQ2_0
                 else:
                     raise ValueError(f"Unknown file type: {self.ftype.name}")
@@ -600,6 +637,9 @@ class Model:
         if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
             # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
             res = "exaone"
+        if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
+            # ref: https://huggingface.co/microsoft/phi-2
+            res = "phi-2"
 
         if res is None:
             logger.warning("\n")
@@ -1458,7 +1498,7 @@ class StableLMModel(Model):
         raise ValueError(f"Unprocessed norms: {norms}")
 
 
-@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
 class LlamaModel(Model):
     model_arch = gguf.MODEL_ARCH.LLAMA
 
@@ -1570,7 +1610,7 @@ class LlamaModel(Model):
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
                 factor = rope_scaling.get("factor", 8.0)
@@ -1593,7 +1633,8 @@ class LlamaModel(Model):
                     smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                     rope_factors.append(1 / ((1 - smooth) / factor + smooth))
 
-                self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+                if not self.is_lora:
+                    self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
 
         super().prepare_tensors()
@@ -1616,15 +1657,16 @@ class BitnetModel(Model):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
         self.gguf_writer.add_rope_scaling_factor(1.0)
 
-    def weight_quant(self, weight):
+    def weight_quant(self, weight: Tensor) -> Tensor:
         dtype = weight.dtype
         weight = weight.float()
-        s = 1 / weight.abs().mean().clamp(min=1e-5)
-        weight = (weight * s).round().clamp(-1, 1) / s
-        scale = weight.abs().max().unsqueeze(0)
-        weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
-        weight = torch.sign(weight).type(dtype)
-        return weight.type(dtype), scale.type(torch.float32)
+        scale = weight.abs().mean().clamp(min=1e-5)
+        iscale = 1 / scale
+        # TODO: multiply by the scale directly instead of inverting it twice
+        # (this is also unnecessarily doubly inverted upstream)
+        # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10
+        result = (weight * iscale).round().clamp(-1, 1) / iscale
+        return result.type(dtype)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         new_name = self.map_tensor_name(name)
 
@@ -1639,11 +1681,9 @@ class BitnetModel(Model):
             gguf.MODEL_TENSOR.FFN_GATE,
         ]):
             # transform weight into 1/0/-1 (in fp32)
-            weight_torch, scale_torch = self.weight_quant(data_torch)
-            yield (new_name, weight_torch)
-            yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
-        else:
-            yield (new_name, data_torch)
+            data_torch = self.weight_quant(data_torch)
+
+        yield (new_name, data_torch)
 
 
 @Model.register("GrokForCausalLM")
@@ -1812,6 +1852,60 @@ class MiniCPMModel(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("MiniCPM3ForCausalLM")
+class MiniCPM3Model(Model):
+    model_arch = gguf.MODEL_ARCH.MINICPM3
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+
+        rope_dims = hparams["qk_rope_head_dim"]
+
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
+        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
+            self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+
+        rope_scaling = self.find_hparam(['rope_scaling'], True)
+        if rope_scaling is None:
+            return
+
+        long_factors = rope_scaling.get('long_factor', None)
+        short_factors = rope_scaling.get('short_factor', None)
+
+        if long_factors is None or short_factors is None:
+            raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
+
+        if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
+            raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+
+        self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
+        self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
+
+    def set_vocab(self):
+        self._set_vocab_llama_hf()
+
+    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
+        if n_kv_head is not None and n_head != n_kv_head:
+            n_head //= n_kv_head
+
+        return (
+            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape)
+        )
+
+
 @Model.register("QWenLMHeadModel")
 class QwenModel(Model):
     model_arch = gguf.MODEL_ARCH.QWEN
@@ -2140,8 +2234,9 @@ class Phi3MiniModel(Model):
         if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
             raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
 
-        self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
-        self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
+        if not self.is_lora:
+            self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
+            self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
 
 
 @Model.register("PlamoForCausalLM")
@@ -2712,6 +2807,86 @@ class StarCoder2Model(Model):
     model_arch = gguf.MODEL_ARCH.STARCODER2
 
 
+@Model.register("Rwkv6ForCausalLM")
+class Rwkv6Model(Model):
+    model_arch = gguf.MODEL_ARCH.RWKV6
+
+    def set_vocab(self):
+        assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
+        vocab_size = self.hparams.get("vocab_size", 65536)
+
+        tokens: list[bytes] = ['<s>'.encode("utf-8")]
+        toktypes: list[int] = [gguf.TokenType.CONTROL]
+
+        with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
+            lines = f.readlines()
+            for line in lines:
+                parts = line.split(' ')
+                assert len(parts) >= 3
+                token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
+                token = token.encode("utf-8") if isinstance(token, str) else token
+                assert isinstance(token, bytes)
+                assert len(token) == token_len
+                token_text: str = repr(token)[2:-1]  # "b'\xff'" -> "\xff"
+                tokens.append(token_text.encode("utf-8"))
+                toktypes.append(gguf.TokenType.NORMAL)
+        remainder = vocab_size - len(tokens)
+        assert remainder >= 0
+        for i in range(len(tokens), vocab_size):
+            tokens.append(f"[PAD{i}]".encode("utf-8"))
+            toktypes.append(gguf.TokenType.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("rwkv")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        head_size = self.hparams["head_size"]
+        hidden_size = self.hparams["hidden_size"]
+        layer_norm_eps = self.hparams["layer_norm_epsilon"]
+        rescale_every_n_layers = self.hparams["rescale_every"]
+        intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)
+        time_mix_extra_dim = 64 if hidden_size == 4096 else 32
+        time_decay_extra_dim = 128 if hidden_size == 4096 else 64
+
+        # RWKV isn't context limited
+        self.gguf_writer.add_context_length(1048576)
+        self.gguf_writer.add_embedding_length(hidden_size)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
+        self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
+        self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
+        self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        # required by llama.cpp, unused
+        self.gguf_writer.add_head_count(0)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        new_name = self.map_tensor_name(name)
+
+        if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
+            new_name += ".weight"
+
+        if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"):
+            data_torch = data_torch.transpose(0, 1)
+
+        if new_name.endswith("time_mix_w2.weight"):
+            data_torch = data_torch.permute(0, 2, 1)
+
+        rescale_every_n_layers = self.hparams["rescale_every"]
+        if rescale_every_n_layers > 0:
+            if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
+                data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
+
+        yield (new_name, data_torch)
+
+
 @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(Model):
     model_arch = gguf.MODEL_ARCH.MAMBA
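For reference, each line of `rwkv_vocab_v20230424.txt` has the layout `<index> <token-literal> <byte-length>`, where the token literal is a Python string or bytes literal; the converter above recovers the raw bytes with `ast.literal_eval` and checks the recorded length. A standalone sketch of that parsing step on two made-up lines in the same layout:

```python
import ast

# two made-up lines in the "<idx> <literal> <len>" layout used by rwkv_vocab_v20230424.txt
lines = [
    "1 'hello' 5\n",
    "2 b'\\xff' 1\n",
]

for line in lines:
    parts = line.split(' ')
    assert len(parts) >= 3
    # the literal may itself contain spaces, hence the join over the middle parts
    token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
    token = token.encode("utf-8") if isinstance(token, str) else token
    assert isinstance(token, bytes) and len(token) == token_len
    print(parts[0], token)
```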
@@ -2905,6 +3080,66 @@ class OlmoModel(Model):
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("OlmoeForCausalLM")
+class OlmoeModel(Model):
+    model_arch = gguf.MODEL_ARCH.OLMOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_layer_norm_rms_eps(1e-5)
+        if (n_experts := self.hparams.get("num_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    # Copied from: Qwen2MoeModel
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    # Copied from: Qwen2MoeModel
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @Model.register("JinaBertModel", "JinaBertForMaskedLM")
 class JinaBertV2Model(BertModel):
     model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
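For reference, the expert-merging logic above buffers the per-expert projection weights for a block and, once all of them have been seen, stacks them into a single 3D tensor whose first dimension is the expert index. A minimal sketch of that stacking step with dummy tensors (the expert count and shapes are illustrative):

```python
import torch

n_experts = 4
# dummy per-expert gate_proj weights for one block, shape (ffn_dim, hidden_dim)
experts = {
    f"model.layers.0.mlp.experts.{xid}.gate_proj.weight": torch.randn(32, 16)
    for xid in range(n_experts)
}

datas = []
for xid in range(n_experts):
    ename = f"model.layers.0.mlp.experts.{xid}.gate_proj.weight"
    datas.append(experts.pop(ename))  # consume each expert tensor exactly once

merged = torch.stack(datas, dim=0)   # (n_experts, ffn_dim, hidden_dim)
assert merged.shape == (n_experts, 32, 16)
```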
@@ -3887,7 +4122,7 @@ class ExaoneModel(Model):
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
                 factor = rope_scaling.get("factor", 8.0)
@@ -3910,11 +4145,42 @@ class ExaoneModel(Model):
                     smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                     rope_factors.append(1 / ((1 - smooth) / factor + smooth))
 
-            self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
+            if not self.is_lora:
+                self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
 
         super().prepare_tensors()
 
 
+@Model.register("GraniteForCausalLM")
+class GraniteModel(LlamaModel):
+    """Conversion for IBM's GraniteForCausalLM"""
+    model_arch = gguf.MODEL_ARCH.GRANITE
+
+    def set_gguf_parameters(self):
+        """Granite uses standard llama parameters with the following differences:
+
+        - No head_dim support
+        - New multiplier params:
+          - attention_scale
+          - embedding_scale
+          - residual_scale
+          - logits_scaling
+        """
+        if head_dim := self.hparams.pop("head_dim", None):
+            logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim)
+        super().set_gguf_parameters()
+        # NOTE: Convert _multiplier params to _scale params for naming
+        # consistency
+        if attention_scale := self.hparams.get("attention_multiplier"):
+            self.gguf_writer.add_attention_scale(attention_scale)
+        if embedding_scale := self.hparams.get("embedding_multiplier"):
+            self.gguf_writer.add_embedding_scale(embedding_scale)
+        if residual_scale := self.hparams.get("residual_multiplier"):
+            self.gguf_writer.add_residual_scale(residual_scale)
+        if logits_scaling := self.hparams.get("logits_scaling"):
+            self.gguf_writer.add_logit_scale(logits_scaling)
+
+
 ###### CONVERSION LOGIC ######
 
 # tree of lazy tensors
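For reference, the Exaone hunk above computes per-frequency rope factors using the llama3-style interpolation `1 / ((1 - smooth) / factor + smooth)`. Only that interpolation branch is visible in this hunk; a minimal sketch of it is below, with illustrative parameter values, and with the understanding (an assumption here, not shown in the excerpt) that the full converter treats wavelengths outside the low/high band separately:

```python
import math

# illustrative values; the converter reads these from rope_scaling in the model config
factor           = 8.0
low_freq_factor  = 1.0
high_freq_factor = 4.0
old_context_len  = 8192

def interpolated_rope_factor(freq: float) -> float:
    # the "smooth" interpolation branch shown in the hunk above
    wavelen = 2 * math.pi / freq
    smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
    return 1 / ((1 - smooth) / factor + smooth)

print(interpolated_rope_factor(1.0 / 512))
```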
@@ -3995,8 +4261,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -4083,6 +4349,8 @@ def main() -> None:
         "f16": gguf.LlamaFileType.MOSTLY_F16,
         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
         "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
+        "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0,
+        "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0,
         "auto": gguf.LlamaFileType.GUESSED,
     }
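For reference, the new `tq1_0`/`tq2_0` choices map directly onto entries in the `ftype_map` shown above. A small self-contained sketch of the same pattern; a local enum stands in for `gguf.LlamaFileType` so the snippet runs without the `gguf` package (the enum values here are placeholders, not the real ones):

```python
import argparse
from enum import IntEnum, auto

class FileType(IntEnum):  # stand-in for gguf.LlamaFileType
    MOSTLY_F32 = auto()
    MOSTLY_F16 = auto()
    MOSTLY_BF16 = auto()
    MOSTLY_Q8_0 = auto()
    MOSTLY_TQ1_0 = auto()
    MOSTLY_TQ2_0 = auto()
    GUESSED = auto()

ftype_map = {
    "f32": FileType.MOSTLY_F32,
    "f16": FileType.MOSTLY_F16,
    "bf16": FileType.MOSTLY_BF16,
    "q8_0": FileType.MOSTLY_Q8_0,
    "tq1_0": FileType.MOSTLY_TQ1_0,
    "tq2_0": FileType.MOSTLY_TQ2_0,
    "auto": FileType.GUESSED,
}

parser = argparse.ArgumentParser()
parser.add_argument("--outtype", type=str, choices=list(ftype_map), default="f16")
args = parser.parse_args(["--outtype", "tq2_0"])
print(ftype_map[args.outtype])  # FileType.MOSTLY_TQ2_0
```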
@@ -31,6 +31,7 @@ import re
 import requests
 import sys
 import json
+import shutil
 
 from hashlib import sha256
 from enum import IntEnum, auto
@@ -97,6 +98,7 @@ models = [
     {'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
     {'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
     {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
+    {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
 ]
 
 
@@ -125,12 +127,27 @@ def download_model(model):
     if tokt == TOKENIZER_TYPE.UGM:
         files.append("spiece.model")
 
-    for file in files:
-        save_path = f"models/tokenizers/{name}/{file}"
-        if os.path.isfile(save_path):
-            logger.info(f"{name}: File {save_path} already exists - skipping")
-            continue
-        download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
+    if os.path.isdir(repo):
+        # If repo is a path on the file system, copy the directory
+        for file in files:
+            src_path = os.path.join(repo, file)
+            dst_path = f"models/tokenizers/{name}/{file}"
+            if os.path.isfile(dst_path):
+                logger.info(f"{name}: File {dst_path} already exists - skipping")
+                continue
+            if os.path.isfile(src_path):
+                shutil.copy2(src_path, dst_path)
+                logger.info(f"{name}: Copied {src_path} to {dst_path}")
+            else:
+                logger.warning(f"{name}: Source file {src_path} does not exist")
+    else:
+        # If repo is a URL, download the files
+        for file in files:
+            save_path = f"models/tokenizers/{name}/{file}"
+            if os.path.isfile(save_path):
+                logger.info(f"{name}: File {save_path} already exists - skipping")
+                continue
+            download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
 
 
 for model in models:
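For reference, the updated `download_model` treats `repo` either as a local directory (copy the tokenizer files with `shutil.copy2`) or as a Hugging Face URL (download as before). A minimal sketch of that branching; the `fetch` callback is a hypothetical stand-in for the script's `download_file_with_auth`:

```python
import os
import shutil

def fetch(url: str, save_path: str) -> None:
    # hypothetical stand-in for download_file_with_auth
    print(f"would download {url} -> {save_path}")

def get_tokenizer_files(repo: str, name: str, files: list[str]) -> None:
    os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
    if os.path.isdir(repo):
        # repo is a path on the file system: copy the files
        for file in files:
            src = os.path.join(repo, file)
            dst = f"models/tokenizers/{name}/{file}"
            if os.path.isfile(src):
                shutil.copy2(src, dst)
            else:
                print(f"{name}: source file {src} does not exist")
    else:
        # repo is a URL: download the files
        for file in files:
            fetch(f"{repo}/resolve/main/{file}", f"models/tokenizers/{name}/{file}")

get_tokenizer_files("https://huggingface.co/microsoft/phi-2", "phi-2", ["tokenizer.json"])
```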
@@ -363,7 +363,13 @@ if __name__ == '__main__':
                     yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))
 
             def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-                dest = super().modify_tensors(data_torch, name, bid)
+                dest = list(super().modify_tensors(data_torch, name, bid))
+                # some archs may have the same tensor for lm_head and output (tie word embeddings)
+                # in this case, adapters targeting lm_head will fail when using llama-export-lora
+                # therefore, we ignore them for now
+                # see: https://github.com/ggerganov/llama.cpp/issues/9065
+                if name == "lm_head.weight" and len(dest) == 0:
+                    raise ValueError("lm_head is present in adapter, but is ignored in base model")
                 for dest_name, dest_data in dest:
                     assert isinstance(dest_data, LoraTorchTensor)
                     lora_a, lora_b = dest_data.get_lora_A_B()
@@ -386,6 +392,7 @@ if __name__ == '__main__':
             dry_run=args.dry_run,
             dir_lora_model=dir_lora,
             lora_alpha=alpha,
+            is_lora=True,
         )
 
         logger.info("Exporting model...")
@@ -20,7 +20,7 @@
 **oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
 
 - **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
-- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL - Math Kernel Library)*.
+- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*.
 - **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
 - **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
 
@@ -28,10 +28,6 @@
 
 The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*).
 
-When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.
-
-It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
-
 ## Recommended Release
 
 The SYCL backend would be broken by some PRs due to no online CI.
@@ -45,6 +41,10 @@ The following release is verified with good quality:
 
 ## News
 
+
+- 2024.8
+  - Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
+
 - 2024.5
   - Performance is increased: 34 -> 37 tokens/s of llama-2-7b.Q4_0 on Arc770.
   - Arch Linux is verified successfully.
@@ -196,7 +196,7 @@ Please follow the instructions for downloading and installing the Toolkit for Li
 
 Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
 
-Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI MKL for intel GPUs.
+Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI oneDNN for Intel GPUs.
 
 - **Adding support to Nvidia GPUs**
 
@@ -255,8 +255,6 @@ or
 # Export relevant ENV variables
 source /opt/intel/oneapi/setvars.sh
 
-# Build LLAMA with MKL BLAS acceleration for intel GPU
-
 # Option 1: Use FP32 (recommended for better performance in most cases)
 cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 
@@ -338,12 +336,12 @@ Choose one of following methods to run.
 - Use device 0:
 
 ```sh
-./examples/sycl/run_llama2.sh 0
+./examples/sycl/run-llama2.sh 0
 ```
 - Use multiple devices:
 
 ```sh
-./examples/sycl/run_llama2.sh
+./examples/sycl/run-llama2.sh
 ```
 
 2. Command line
@@ -638,6 +636,14 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 
   It's same for other projects including llama.cpp SYCL backend.
 
+- Meet issue: `Native API failed. Native API returns: -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -6 (PI_ERROR_OUT_OF_HOST_MEMORY) -999 (UNKNOWN PI error)` or `failed to allocate SYCL0 buffer`
+
+  Device Memory is not enough.
+
+  |Reason|Solution|
+  |-|-|
+  |Default Context is too big. It leads to more memory usage.|Set `-c 8192` or smaller value.|
+  |Model is big and require more memory than device's.|Choose smaller quantized model, like Q5 -> Q4;<br>Use more than one devices to load model.|
+
 ### **GitHub contribution**:
 Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.
@@ -380,3 +380,9 @@ For detailed info, such as model/device supports, CANN install, please refer to
 ### Android
 
 To read documentation for how to build on Android, [click here](./android.md)
+
+### Arm CPU optimized mulmat kernels
+
+Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
+
+To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
@@ -20,7 +20,7 @@ Additionally, there the following images, similar to the above:
 - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
 - `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
 
-The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
+The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
 
 ## Usage
 
@@ -66,8 +66,8 @@ You may want to pass in some different `ARGS`, depending on the CUDA environment
 
 The defaults are:
 
-- `CUDA_VERSION` set to `11.7.1`
-- `CUDA_DOCKER_ARCH` set to `all`
+- `CUDA_VERSION` set to `12.6.0`
+- `CUDA_DOCKER_ARCH` set to the cmake build default, which includes all the supported architectures
 
 The resulting images, are essentially the same as the non-CUDA images:
@@ -18,7 +18,7 @@ constexpr float rms_norm_eps = 5e-6f;
 #endif
 
 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
 
     if (plan.work_size > 0) {
         buf.resize(plan.work_size);
@@ -49,3 +49,12 @@ There are 2 modes of operation:
 | 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 |
 | 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 |
 | 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 |
+
+### JSONL output
+
+Pass `--output-format jsonl` to output JSONL instead of Markdown, á la
+
+```json lines
+{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 1, "n_kv": 256, "t_pp": 0.233810, "speed_pp": 547.453064, "t_tg": 3.503684, "speed_tg": 36.532974, "t": 3.737494, "speed": 68.495094}
+{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 2, "n_kv": 512, "t_pp": 0.422602, "speed_pp": 605.770935, "t_tg": 11.106112, "speed_tg": 23.050371, "t": 11.528713, "speed": 44.410854}
+```
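For reference, since each JSONL line is a self-contained JSON object, the output is easy to post-process. A short sketch that reads lines like the ones above and prints the prompt-processing and text-generation speeds per batch count (the field names are taken from the example output):

```python
import json

jsonl = '''\
{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 1, "n_kv": 256, "t_pp": 0.233810, "speed_pp": 547.453064, "t_tg": 3.503684, "speed_tg": 36.532974, "t": 3.737494, "speed": 68.495094}
{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 2, "n_kv": 512, "t_pp": 0.422602, "speed_pp": 605.770935, "t_tg": 11.106112, "speed_tg": 23.050371, "t": 11.528713, "speed": 44.410854}
'''

for line in jsonl.splitlines():
    row = json.loads(line)
    print(f"pl={row['pl']:3d}  speed_pp={row['speed_pp']:10.2f} t/s  speed_tg={row['speed_tg']:8.2f} t/s")
```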
@@ -1,49 +1,28 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <algorithm>
-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
 
-// mutates the input string
-static std::vector<int> parse_list(char * p) {
-    std::vector<int> ret;
-
-    char * q = p;
-
-    while (*p) {
-        if (*p == ',') {
-            *p = '\0';
-            ret.push_back(std::atoi(q));
-            q = p + 1;
-        }
-
-        ++p;
-    }
-
-    ret.push_back(std::atoi(q));
-
-    return ret;
-}
-
-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n    %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
-    LOG_TEE("\n");
+static void print_usage(int, char ** argv) {
+    LOG("\nexample usage:\n");
+    LOG("\n    %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
+    LOG("\n");
 }
 
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
         return 1;
     }
 
+    gpt_init();
+
     int is_pp_shared = params.is_pp_shared;
 
     std::vector<int> n_pp = params.n_pp;
@@ -100,7 +79,7 @@ int main(int argc, char ** argv) {
 
         const int ret = llama_decode(ctx, batch_view);
         if (ret != 0) {
-            LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+            LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
             return false;
         }
 
@@ -117,17 +96,18 @@ int main(int argc, char ** argv) {
         }
 
        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
            return 1;
        }
    }
 
-    LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
-    LOG_TEE("\n");
-    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
-    LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+    if (!params.batched_bench_output_jsonl) {
+        LOG("\n");
+        LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+        LOG("\n");
+        LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
+        LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+    }
 
     for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
         for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
@@ -156,7 +136,7 @@ int main(int argc, char ** argv) {
             llama_kv_cache_clear(ctx);
 
             if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-                LOG_TEE("%s: llama_decode() failed\n", __func__);
+                LOG_ERR("%s: llama_decode() failed\n", __func__);
                 return 1;
             }
 
@@ -178,7 +158,7 @@ int main(int argc, char ** argv) {
             }
 
             if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-                LOG_TEE("%s: llama_decode() failed\n", __func__);
+                LOG_ERR("%s: llama_decode() failed\n", __func__);
                 return 1;
             }
         }
@@ -195,12 +175,22 @@ int main(int argc, char ** argv) {
             const float speed_tg = pl*tg / t_tg;
             const float speed    = n_kv / t;
 
-            LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+            if(params.batched_bench_output_jsonl) {
+                LOG(
+                    "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
+                    "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
+                    n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
+                    pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
+                );
+            } else {
+                LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+            }
         }
     }
 }
 
-    llama_print_timings(ctx);
+    LOG("\n");
+    llama_perf_context_print(ctx);
 
     llama_batch_free(batch);
 
@@ -209,7 +199,7 @@ int main(int argc, char ** argv) {
 
     llama_backend_free();
 
-    fprintf(stderr, "\n\n");
+    LOG("\n\n");
 
     return 0;
 }
@@ -27,7 +27,6 @@ guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), mo
     print("Failed to load model")
     exit(1)
 }
 
 defer {
     llama_free_model(model)
 }
@@ -37,7 +36,6 @@ var tokens = tokenize(text: prompt, add_bos: true)
 let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
 
 var context_params = llama_context_default_params()
-context_params.seed = 1234
 context_params.n_ctx = n_kv_req
 context_params.n_batch = UInt32(max(n_len, n_parallel))
 context_params.n_threads = 8
@@ -48,11 +46,26 @@ guard context != nil else {
     print("Failed to initialize context")
     exit(1)
 }
 
 defer {
     llama_free(context)
 }
 
+var sparams = llama_sampler_chain_default_params()
+
+let smpl = llama_sampler_chain_init(sparams)
+guard smpl != nil else {
+    print("Failed to initialize sampling")
+    exit(1)
+}
+defer {
+    llama_sampler_free(smpl)
+}
+
+llama_sampler_chain_add(smpl, llama_sampler_init_top_k(40));
+llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1));
+llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.4));
+llama_sampler_chain_add(smpl, llama_sampler_init_dist (1234));
+
 let n_ctx = llama_n_ctx(context)
 
 print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
@@ -125,32 +138,7 @@ while n_cur <= n_len {
         continue
     }
 
-    var n_vocab = llama_n_vocab(model)
-    var logits = llama_get_logits_ith(context, i_batch[i])
-
-    var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab))
-
-    for token_id in 0 ..< n_vocab {
-        candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
-    }
-
-    var candidates_p: llama_token_data_array = .init(
-        data: &candidates,
-        size: candidates.count,
-        sorted: false
-    )
-
-    let top_k: Int32 = 40
-    let top_p: Float = 0.9
-    let temp: Float = 0.4
-
-    llama_sample_top_k(context, &candidates_p, top_k, 1)
-    llama_sample_top_p(context, &candidates_p, top_p, 1)
-    llama_sample_temp(context, &candidates_p, temp)
-
-    let new_token_id = llama_sample_token(context, &candidates_p)
-
-    // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+    let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])
 
     // is it an end of stream? -> mark the stream as finished
     if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
@@ -210,9 +198,10 @@ if n_parallel > 1 {
 
 let t_main_end = ggml_time_us()
 
-print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n")
+print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")
 
-llama_print_timings(context)
+llama_perf_sampler_print(smpl)
+llama_perf_context_print(context)
 
 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
     let utf8Count = text.utf8.count
@@ -1,18 +1,17 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <algorithm>
-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
 
-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n    %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
-    LOG_TEE("\n");
+static void print_usage(int, char ** argv) {
+    LOG("\nexample usage:\n");
+    LOG("\n    %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
+    LOG("\n");
 }
 
 int main(int argc, char ** argv) {
@@ -21,11 +20,11 @@ int main(int argc, char ** argv) {
     params.prompt = "Hello my name is";
     params.n_predict = 32;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
         return 1;
     }
 
+    gpt_init();
+
     // number of parallel batches
     int n_parallel = params.n_parallel;
@@ -45,7 +44,7 @@ int main(int argc, char ** argv) {
     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
 
     if (model == NULL) {
-        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        LOG_ERR("%s: error: unable to load model\n" , __func__);
         return 1;
     }
 
@@ -65,32 +64,39 @@ int main(int argc, char ** argv) {
 
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
 
+    auto sparams = llama_sampler_chain_default_params();
+
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k));
+    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep));
+    llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp));
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
+
     if (ctx == NULL) {
-        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
         return 1;
     }
 
     const int n_ctx = llama_n_ctx(ctx);
 
-    LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+    LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
 
     // make sure the KV cache is big enough to hold all the prompt and generated tokens
     if (n_kv_req > n_ctx) {
-        LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
-        LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
+        LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
+        LOG_ERR("%s: either reduce n_parallel or increase n_ctx\n", __func__);
         return 1;
     }
 
     // print the prompt token-by-token
 
-    fprintf(stderr, "\n");
+    LOG("\n");
 
     for (auto id : tokens_list) {
-        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+        LOG("%s", llama_token_to_piece(ctx, id).c_str());
     }
 
-    fflush(stderr);
-
     // create a llama_batch
     // we use this object to submit token data for decoding
     llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
@@ -108,7 +114,7 @@ int main(int argc, char ** argv) {
 
     if (llama_model_has_encoder(model)) {
         if (llama_encode(ctx, batch)) {
-            LOG_TEE("%s : failed to eval\n", __func__);
+            LOG_ERR("%s : failed to eval\n", __func__);
             return 1;
         }
 
@@ -125,7 +131,7 @@ int main(int argc, char ** argv) {
     batch.logits[batch.n_tokens - 1] = true;
 
     if (llama_decode(ctx, batch) != 0) {
-        LOG_TEE("%s: llama_decode() failed\n", __func__);
+        LOG_ERR("%s: llama_decode() failed\n", __func__);
         return 1;
     }
 
@@ -136,7 +142,7 @@ int main(int argc, char ** argv) {
     //}
 
     if (n_parallel > 1) {
-        LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
+        LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
     }
 
     // main loop
@@ -164,36 +170,14 @@ int main(int argc, char ** argv) {
                 continue;
             }
 
-            auto   n_vocab = llama_n_vocab(model);
-            auto * logits  = llama_get_logits_ith(ctx, i_batch[i]);
-
-            std::vector<llama_token_data> candidates;
-            candidates.reserve(n_vocab);
-
-            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-            }
-
-            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-            const int   top_k = 40;
-            const float top_p = 0.9f;
-            const float temp  = 0.4f;
-
-            llama_sample_top_k(ctx, &candidates_p, top_k, 1);
-            llama_sample_top_p(ctx, &candidates_p, top_p, 1);
-            llama_sample_temp (ctx, &candidates_p, temp);
-
-            const llama_token new_token_id = llama_sample_token(ctx, &candidates_p);
-
-            //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
 
             // is it an end of generation? -> mark the stream as finished
             if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
                 i_batch[i] = -1;
-                LOG_TEE("\n");
+                LOG("\n");
                 if (n_parallel > 1) {
-                    LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
+                    LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
                 }
 
                 continue;
@@ -201,8 +185,7 @@ int main(int argc, char ** argv) {
 
             // if there is only one stream, we print immediately to stdout
             if (n_parallel == 1) {
-                LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
-                fflush(stdout);
+                LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
             }
 
             streams[i] += llama_token_to_piece(ctx, new_token_id);
@@ -224,32 +207,33 @@ int main(int argc, char ** argv) {
 
     // evaluate the current batch with the transformer model
     if (llama_decode(ctx, batch)) {
-        fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+        LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
         return 1;
     }
 }
 
-    LOG_TEE("\n");
-
     if (n_parallel > 1) {
-        LOG_TEE("\n");
+        LOG("\n");
 
         for (int32_t i = 0; i < n_parallel; ++i) {
-            LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
+            LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
         }
     }
 
     const auto t_main_end = ggml_time_us();
 
-    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+    LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
-    llama_print_timings(ctx);
+    LOG("\n");
+    llama_perf_sampler_print(smpl);
+    llama_perf_context_print(ctx);
 
     fprintf(stderr, "\n");
 
     llama_batch_free(batch);
 
+    llama_sampler_free(smpl);
     llama_free(ctx);
     llama_free_model(model);
@@ -21,7 +21,7 @@
 #endif
 
 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
 
     if (plan.work_size > 0) {
         buf.resize(plan.work_size);
@@ -54,7 +54,7 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) {
 #define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
 
 struct benchmark_params_struct {
-    int32_t n_threads     = 1;
+    int     n_threads     = 1;
     int32_t n_iterations = 10;
 };
 
@@ -183,7 +183,7 @@ int main(int argc, char ** argv) {
 
     ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
 
-    TENSOR_DUMP(gf->nodes[0]);
+    TENSOR_DUMP(ggml_graph_node(gf, 0));
 
     printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
 
@@ -224,7 +224,7 @@ int main(int argc, char ** argv) {
 
 
     // Let's use the F32 result from above as a reference for the quantized multiplication
-    float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
+    float sum_of_F32_reference = tensor_sum_elements(ggml_graph_node(gf, 0));
 
     printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
     printf("=====================================================================================\n");
@@ -252,7 +252,7 @@ int main(int argc, char ** argv) {
 
     // Check that the matrix multiplication result is in the right ballpark
    // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
-    float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
+    float sum_of_Q4_result = tensor_sum_elements(ggml_graph_node(gf31, 0));
     float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
     float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
@@ -9,6 +9,7 @@
 #include <climits>
 #include <cstring>
 #include <cstdarg>
+#include <cinttypes>
 #include <ctime>
 #include <random>
 #include <stdexcept>
@@ -105,43 +106,43 @@ static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_
 const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
 try {
 w->token_embedding_table.resize(p->vocab_size * p->dim);
-LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);

 w->rms_att_weight.resize(p->n_layers * p->dim);
-LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
+LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);

 w->rms_ffn_weight.resize(p->n_layers * p->dim);
-LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
+LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);

 w->wq.resize(p->n_layers * p->dim * p->dim);
-LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

 w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
-LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
+LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);

 w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
-LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
+LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);

 w->wo.resize(p->n_layers * p->dim * p->dim);
-LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

 w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
-LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

 w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
-LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
+LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);

 w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
-LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

 w->rms_final_weight.resize(p->dim);
-LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
+LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);

 if (shared_weights) {
 w->wcls = {};
 } else {
 w->wcls.resize(p->vocab_size * p->dim);
-LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
 }
 }
 catch (std::length_error &) {
@@ -173,7 +174,7 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
 fseek(f, 0, SEEK_END);
 auto end = ftell(f);
 if (curr != end) {
-LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
+LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
 return 1;
 }

@@ -181,20 +182,20 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
 }

 static void print_sample_weights(TransformerWeights *w){
-LOG("----- Quick print of first of the weight vales of all the variables\n");
+LOG_INF("----- Quick print of first of the weight vales of all the variables\n");
-LOG("%f\n", w->token_embedding_table[0]);
+LOG_INF("%f\n", w->token_embedding_table[0]);
-LOG("%f\n", w->rms_att_weight[0]);
+LOG_INF("%f\n", w->rms_att_weight[0]);
-LOG("%f\n", w->rms_ffn_weight[0]);
+LOG_INF("%f\n", w->rms_ffn_weight[0]);

-LOG("%f\n", w->wq[0]);
+LOG_INF("%f\n", w->wq[0]);
-LOG("%f\n", w->wk[0]);
+LOG_INF("%f\n", w->wk[0]);
-LOG("%f\n", w->wv[0]);
+LOG_INF("%f\n", w->wv[0]);
-LOG("%f\n", w->wo[0]);
+LOG_INF("%f\n", w->wo[0]);
-LOG("%f\n", w->w1[0]);
+LOG_INF("%f\n", w->w1[0]);
-LOG("%f\n", w->w2[0]);
+LOG_INF("%f\n", w->w2[0]);
-LOG("%f\n", w->w3[0]);
+LOG_INF("%f\n", w->w3[0]);
-LOG("%f\n", w->rms_att_weight[0]);
+LOG_INF("%f\n", w->rms_att_weight[0]);
-if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]);
+if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////

@@ -318,20 +319,20 @@ struct train_params {
 };

 static void print_params(struct my_llama_hparams * params) {
-LOG("%s: n_vocab: %u\n", __func__, params->n_vocab);
+LOG_INF("%s: n_vocab: %u\n", __func__, params->n_vocab);
-LOG("%s: n_ctx: %u\n", __func__, params->n_ctx);
+LOG_INF("%s: n_ctx: %u\n", __func__, params->n_ctx);
-LOG("%s: n_embd: %u\n", __func__, params->n_embd);
+LOG_INF("%s: n_embd: %u\n", __func__, params->n_embd);
-LOG("%s: n_mult: %u\n", __func__, params->n_mult);
+LOG_INF("%s: n_mult: %u\n", __func__, params->n_mult);
-LOG("%s: n_head: %u\n", __func__, params->n_head);
+LOG_INF("%s: n_head: %u\n", __func__, params->n_head);
-LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
+LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
-LOG("%s: n_ff: %u\n", __func__, params->n_ff);
+LOG_INF("%s: n_ff: %u\n", __func__, params->n_ff);
-LOG("%s: n_layer: %u\n", __func__, params->n_layer);
+LOG_INF("%s: n_layer: %u\n", __func__, params->n_layer);
-LOG("%s: n_rot: %u\n", __func__, params->n_rot);
+LOG_INF("%s: n_rot: %u\n", __func__, params->n_rot);
 }

 static void print_tensor_info(const struct ggml_context * ctx) {
 for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-LOG("%s: Allocating ", __func__);
+LOG_INF("%s: Allocating ", __func__);
 int64_t total = 1;
 int i = 0;
 for (; i < ggml_n_dims(t); ++i) {
@@ -526,7 +527,7 @@ static std::string llama_escape_whitespaces(const std::string & text) {

 static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
 if (is_ggml_file(filename)) {
-LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
+LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
 struct ggml_context * ctx_data = NULL;

 struct gguf_init_params params = {
@@ -574,7 +575,7 @@ static void load_vocab(const char * filename, const Config * config, struct llam
 gguf_free(ctx);
 } else {
 // assume llama2.c vocabulary
-LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
+LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
 llama_file file(filename, "rb");
 if (!file.fp) {
 die_fmt("%s: %s", strerror(errno), filename);
@@ -871,23 +872,25 @@ static std::string basename(const std::string &path) {
 }

 int main(int argc, char ** argv) {
+gpt_init();

 struct train_params params = get_default_train_params();
 if (!params_parse(argc, argv, &params)) {
 return 1;
 }
-log_set_target(stdout);
 Config config;
 TransformerWeights weights = {};
 {
-LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
+LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
 FILE * file = fopen(params.fn_llama2c_model, "rb");
 if (!file) {
-LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
+LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
 return 1;
 }
 // read in the config header
 if (fread(&config, sizeof(Config), 1, file) != 1) {
-LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
+LOG_ERR("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
 return 1;
 }
 auto shared_weights = config.vocab_size > 0;
@@ -896,7 +899,7 @@ int main(int argc, char ** argv) {
 // read in the Transformer weights
 alloc_weights(&weights, &config, shared_weights);
 if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
-LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
+LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
 return 1;
 }
 fclose(file);
@@ -929,7 +932,7 @@ int main(int argc, char ** argv) {
 model.name = basename(params.fn_llama2c_model);
 save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);

-LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
+LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);

 ggml_free(model.ctx);
 return 0;

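Note on the logging hunks above: the converter (and the other examples below) drop their local LOG macro and the log_set_target() call in favor of the common library's leveled macros plus a gpt_init() call at startup. The following is a minimal sketch of that pattern, not part of the diff; it assumes a program built inside the llama.cpp tree against common.h and log.h from this revision, and everything beyond the calls shown in the hunks (the argument check, the message texts) is illustrative.

    #include "common.h"
    #include "log.h"

    int main(int argc, char ** argv) {
        gpt_init(); // common/logging initialization used by the examples after this change

        LOG_INF("%s: starting with %d argument(s)\n", __func__, argc);

        if (argc < 2) {
            LOG_ERR("%s: missing required argument\n", __func__); // error level, replaces fprintf(stderr, ...)
            return 1;
        }

        LOG_WRN("%s: this is only a logging sketch, no model is loaded\n", __func__);
        LOG("plain, unprefixed output still goes through LOG(...)\n");

        return 0;
    }
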
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 #include "ggml.h"
@@ -12,14 +13,15 @@
 #include "ggml-metal.h"
 #endif

+#include <algorithm>
+#include <climits>
 #include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iostream>
 #include <string>
 #include <tuple>
 #include <vector>
-#include <algorithm>
-#include <iostream>
-#include <fstream>
-#include <climits>


 //////////////////////////////////////////////////
@@ -35,9 +37,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
 return ret;
 }

-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-gpt_params_print_usage(argc, argv, params);
+static void print_usage(int, char ** argv) {

 printf("\nexample usage:\n");
 printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
 printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
@@ -390,8 +390,7 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
 int main(int argc, char ** argv) {
 gpt_params params;

-if (!gpt_params_parse(argc, argv, params)) {
-print_usage(argc, argv, params);
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
 return 1;
 }

@@ -486,8 +485,8 @@ int main(int argc, char ** argv) {
 if (use_pca) {
 // run PCA
 PCA::pca_params pca_params;
-pca_params.n_threads = params.n_threads;
+pca_params.n_threads = params.cpuparams.n_threads;
 pca_params.n_batch = params.n_pca_batch;
 pca_params.n_iterations = params.n_pca_iterations;
 PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
 } else {

@@ -12,12 +12,9 @@

 #include <cstdio>
 #include <ctime>
+#include <random>
 #include <string>
-#include <tuple>
 #include <vector>
-#include <algorithm>
-#include <iostream>
-#include <fstream>

 #define DEBUG_POS 5

@@ -229,8 +226,8 @@ static ggml_status compute_piter(
 result.eigenvectors.resize(params.n_batch);
 result.distances.resize(params.n_batch);
 // get output nodes
-for (int i = 0; i < gf->n_nodes; ++i) {
+for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
-auto node = gf->nodes[i];
+auto node = ggml_graph_node(gf, i);
 int iter = -1;
 // find b_tensor (without copying data from device)
 if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {

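The benchmark, PCA and export-lora hunks all replace direct access to gf->nodes / gf->n_nodes with the ggml_graph_n_nodes()/ggml_graph_node() accessors. A minimal sketch of the accessor API, not part of the diff and assuming the ggml.h from this revision; the helper names and the node-name lookup are illustrative.

    #include "ggml.h"

    #include <cstring>

    // scan an already-built compute graph for a node with the given name (hypothetical helper)
    static struct ggml_tensor * find_node_by_name(struct ggml_cgraph * gf, const char * name) {
        for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
            struct ggml_tensor * node = ggml_graph_node(gf, i);
            if (std::strcmp(node->name, name) == 0) {
                return node;
            }
        }
        return nullptr;
    }

    // negative indices count from the end, so this replaces gf->nodes[gf->n_nodes - 1]
    static struct ggml_tensor * graph_output(struct ggml_cgraph * gf) {
        return ggml_graph_node(gf, -1);
    }
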
@@ -1,4 +1,6 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"

 #include <ctime>
@@ -38,16 +40,16 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 llama_kv_cache_clear(ctx);

 // run model
-fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
 if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
 // encoder-only model
 if (llama_encode(ctx, batch) < 0) {
-fprintf(stderr, "%s : failed to encode\n", __func__);
+LOG_ERR("%s : failed to encode\n", __func__);
 }
 } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
 // decoder-only model
 if (llama_decode(ctx, batch) < 0) {
-fprintf(stderr, "%s : failed to decode\n", __func__);
+LOG_ERR("%s : failed to decode\n", __func__);
 }
 }

@@ -79,25 +81,16 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 int main(int argc, char ** argv) {
 gpt_params params;

-if (!gpt_params_parse(argc, argv, params)) {
-gpt_params_print_usage(argc, argv, params);
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
 return 1;
 }

+gpt_init();

 params.embedding = true;
 // For non-causal models, batch size must be equal to ubatch size
 params.n_ubatch = params.n_batch;

-print_build_info();

-if (params.seed == LLAMA_DEFAULT_SEED) {
-params.seed = time(NULL);
-}

-fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);

-std::mt19937 rng(params.seed);

 llama_backend_init();
 llama_numa_init(params.numa);

@@ -107,7 +100,7 @@ int main(int argc, char ** argv) {
 llama_model * model = llama_init.model;
 llama_context * ctx = llama_init.context;
 if (model == NULL) {
-fprintf(stderr, "%s: error: unable to load model\n", __func__);
+LOG_ERR("%s: unable to load model\n", __func__);
 return 1;
 }

@@ -117,19 +110,19 @@ int main(int argc, char ** argv) {
 const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);

 if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
-fprintf(stderr, "%s: error: computing embeddings in encoder-decoder models is not supported\n", __func__);
+LOG_ERR("%s: computing embeddings in encoder-decoder models is not supported\n", __func__);
 return 1;
 }

 if (n_ctx > n_ctx_train) {
-fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
 __func__, n_ctx_train, n_ctx);
 }

 // print system information
 {
-fprintf(stderr, "\n");
+LOG_INF("\n");
-fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
 }

 // split the prompt into lines
@@ -144,7 +137,7 @@ int main(int argc, char ** argv) {
 for (const auto & prompt : prompts) {
 auto inp = ::llama_tokenize(ctx, prompt, true, false);
 if (inp.size() > n_batch) {
-fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
 __func__, (long long int) inp.size(), (long long int) n_batch);
 return 1;
 }
@@ -155,20 +148,20 @@ int main(int argc, char ** argv) {
 // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
 for (auto & inp : inputs) {
 if (inp.empty() || inp.back() != llama_token_sep(model)) {
-fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
+LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
-fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
+LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
 }
 }

 // tokenization stats
 if (params.verbose_prompt) {
 for (int i = 0; i < (int) inputs.size(); i++) {
-fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
+LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
-fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
+LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
 for (int j = 0; j < (int) inputs[i].size(); j++) {
-fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
+LOG("%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
 }
-fprintf(stderr, "\n\n");
+LOG("\n\n");
 }
 }

@@ -219,57 +212,57 @@ int main(int argc, char ** argv) {
 batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);

 if (params.embd_out.empty()) {
-fprintf(stdout, "\n");
+LOG("\n");

 if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
 for (int j = 0; j < n_embd_count; j++) {
-fprintf(stdout, "embedding %d: ", j);
+LOG("embedding %d: ", j);
 for (int i = 0; i < std::min(3, n_embd); i++) {
 if (params.embd_normalize == 0) {
-fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+LOG("%6.0f ", emb[j * n_embd + i]);
 } else {
-fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+LOG("%9.6f ", emb[j * n_embd + i]);
 }
 }
-fprintf(stdout, " ... ");
+LOG(" ... ");
 for (int i = n_embd - 3; i < n_embd; i++) {
 if (params.embd_normalize == 0) {
-fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+LOG("%6.0f ", emb[j * n_embd + i]);
 } else {
-fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+LOG("%9.6f ", emb[j * n_embd + i]);
 }
 }
-fprintf(stdout, "\n");
+LOG("\n");
 }
 } else {
 // print the first part of the embeddings or for a single prompt, the full embedding
 for (int j = 0; j < n_prompts; j++) {
-fprintf(stdout, "embedding %d: ", j);
+LOG("embedding %d: ", j);
 for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
 if (params.embd_normalize == 0) {
-fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+LOG("%6.0f ", emb[j * n_embd + i]);
 } else {
-fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+LOG("%9.6f ", emb[j * n_embd + i]);
 }
 }
-fprintf(stdout, "\n");
+LOG("\n");
 }

 // print cosine similarity matrix
 if (n_prompts > 1) {
-fprintf(stdout, "\n");
+LOG("\n");
-printf("cosine similarity matrix:\n\n");
+LOG("cosine similarity matrix:\n\n");
 for (int i = 0; i < n_prompts; i++) {
-fprintf(stdout, "%6.6s ", prompts[i].c_str());
+LOG("%6.6s ", prompts[i].c_str());
 }
-fprintf(stdout, "\n");
+LOG("\n");
 for (int i = 0; i < n_prompts; i++) {
 for (int j = 0; j < n_prompts; j++) {
 float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-fprintf(stdout, "%6.2f ", sim);
+LOG("%6.2f ", sim);
 }
-fprintf(stdout, "%1.10s", prompts[i].c_str());
+LOG("%1.10s", prompts[i].c_str());
-fprintf(stdout, "\n");
+LOG("\n");
 }
 }
 }
@@ -278,43 +271,45 @@ int main(int argc, char ** argv) {
 if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
 const bool notArray = params.embd_out != "array";

-fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
+LOG(notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
 for (int j = 0;;) { // at least one iteration (one prompt)
-if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
+if (notArray) LOG(" {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
-fprintf(stdout, "[");
+LOG("[");
 for (int i = 0;;) { // at least one iteration (n_embd > 0)
-fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
 i++;
-if (i < n_embd) fprintf(stdout, ","); else break;
+if (i < n_embd) LOG(","); else break;
 }
-fprintf(stdout, notArray ? "]\n }" : "]");
+LOG(notArray ? "]\n }" : "]");
 j++;
-if (j < n_embd_count) fprintf(stdout, notArray ? ",\n" : ","); else break;
+if (j < n_embd_count) LOG(notArray ? ",\n" : ","); else break;
 }
-fprintf(stdout, notArray ? "\n ]" : "]\n");
+LOG(notArray ? "\n ]" : "]\n");

 if (params.embd_out == "json+" && n_prompts > 1) {
-fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
+LOG(",\n \"cosineSimilarity\": [\n");
 for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
-fprintf(stdout, " [");
+LOG(" [");
 for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
 float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-fprintf(stdout, "%6.2f", sim);
+LOG("%6.2f", sim);
 j++;
-if (j < n_embd_count) fprintf(stdout, ", "); else break;
+if (j < n_embd_count) LOG(", "); else break;
 }
-fprintf(stdout, " ]");
+LOG(" ]");
 i++;
-if (i < n_embd_count) fprintf(stdout, ",\n"); else break;
+if (i < n_embd_count) LOG(",\n"); else break;
 }
-fprintf(stdout, "\n ]");
+LOG("\n ]");
 }

-if (notArray) fprintf(stdout, "\n}\n");
+if (notArray) LOG("\n}\n");
 }

+LOG("\n");
+llama_perf_context_print(ctx);

 // clean up
-llama_print_timings(ctx);
 llama_batch_free(batch);
 llama_free(ctx);
 llama_free_model(model);

@@ -1,11 +1,11 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 #include "ggml.h"

 #include <cstdio>
-#include <random>
 #include <string>
-#include <tuple>
 #include <vector>

 /**
@@ -31,22 +31,22 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
 GGML_ASSERT(n > 0);
 float sum = 0;
 for (int64_t i3 = 0; i3 < ne[3]; i3++) {
-printf(" [\n");
+LOG(" [\n");
 for (int64_t i2 = 0; i2 < ne[2]; i2++) {
 if (i2 == n && ne[2] > 2*n) {
-printf(" ..., \n");
+LOG(" ..., \n");
 i2 = ne[2] - n;
 }
-printf(" [\n");
+LOG(" [\n");
 for (int64_t i1 = 0; i1 < ne[1]; i1++) {
 if (i1 == n && ne[1] > 2*n) {
-printf(" ..., \n");
+LOG(" ..., \n");
 i1 = ne[1] - n;
 }
-printf(" [");
+LOG(" [");
 for (int64_t i0 = 0; i0 < ne[0]; i0++) {
 if (i0 == n && ne[0] > 2*n) {
-printf("..., ");
+LOG("..., ");
 i0 = ne[0] - n;
 }
 size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
@@ -64,16 +64,16 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
 } else {
 GGML_ABORT("fatal error");
 }
-printf("%12.4f", v);
+LOG("%12.4f", v);
 sum += v;
-if (i0 < ne[0] - 1) printf(", ");
+if (i0 < ne[0] - 1) LOG(", ");
 }
-printf("],\n");
+LOG("],\n");
 }
-printf(" ],\n");
+LOG(" ],\n");
 }
-printf(" ]\n");
+LOG(" ]\n");
-printf(" sum = %f\n", sum);
+LOG(" sum = %f\n", sum);
 }
 }

@@ -102,11 +102,11 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
 snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
 }

-printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
+LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
 t->name, ggml_type_name(t->type), ggml_op_desc(t),
 src0->name, ggml_ne_string(src0).c_str(),
 src1 ? src1_str : "",
 ggml_ne_string(t).c_str());


 // copy the data from the GPU memory if needed
@@ -132,7 +132,7 @@ static bool run(llama_context * ctx, const gpt_params & params) {
 std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);

 if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
-fprintf(stderr, "%s : failed to eval\n", __func__);
+LOG_ERR("%s : failed to eval\n", __func__);
 return false;
 }

@@ -144,14 +144,11 @@ int main(int argc, char ** argv) {

 gpt_params params;

-if (!gpt_params_parse(argc, argv, params)) {
-gpt_params_print_usage(argc, argv, params);
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
 return 1;
 }

-print_build_info();
+gpt_init();

-std::mt19937 rng(params.seed);

 llama_backend_init();
 llama_numa_init(params.numa);
@@ -168,14 +165,15 @@ int main(int argc, char ** argv) {
 llama_model * model = llama_init.model;
 llama_context * ctx = llama_init.context;
 if (model == nullptr || ctx == nullptr) {
-fprintf(stderr, "%s : failed to init\n", __func__);
+LOG_ERR("%s : failed to init\n", __func__);
 return 1;
 }

 // print system information
 {
-fprintf(stderr, "\n");
+LOG_INF("\n");
-fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+LOG_INF("\n");
 }

 bool OK = run(ctx, params);
@@ -183,7 +181,8 @@ int main(int argc, char ** argv) {
 return 1;
 }

-llama_print_timings(ctx);
+LOG("\n");
+llama_perf_context_print(ctx);

 llama_free(ctx);
 llama_free_model(model);

@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
@@ -369,7 +370,7 @@ struct lora_merge_ctx {

 // write data to output file
 {
-auto result = gf->nodes[gf->n_nodes - 1];
+auto * result = ggml_graph_node(gf, -1);
 size_t len = ggml_nbytes(result);
 if (read_buf.size() < len) {
 read_buf.resize(len);
@@ -391,9 +392,7 @@ struct lora_merge_ctx {
 }
 };

-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-gpt_params_print_usage(argc, argv, params);
+static void print_usage(int, char ** argv) {

 printf("\nexample usage:\n");
 printf("\n %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
 printf("\nNOTE: output model is F16\n");
@@ -403,14 +402,13 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
 int main(int argc, char ** argv) {
 gpt_params params;

-if (!gpt_params_parse(argc, argv, params)) {
-print_usage(argc, argv, params);
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
 return 1;
 }

-g_verbose = (params.verbosity == 1);
+g_verbose = (params.verbosity > 1);
 try {
-lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads);
+lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
 ctx.run_merge();
 } catch (const std::exception & err) {
 fprintf(stderr, "%s\n", err.what());

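The argument-parsing hunks follow one pattern: print_usage() loses its gpt_params argument, and gpt_params_parse() now receives the example's LLAMA_EXAMPLE_* id plus an optional usage callback, so the parser prints usage itself on failure. A minimal sketch of an example main() after this change, not part of the diff; it assumes arg.h/common.h from this revision, and the LLAMA_EXAMPLE_COMMON id and the message texts are only placeholders for illustration.

    #include "arg.h"
    #include "common.h"
    #include "log.h"

    #include <cstdio>

    // usage printer with the new (int, char **) signature
    static void print_usage(int, char ** argv) {
        printf("\nexample usage:\n");
        printf("\n  %s -m model.gguf\n", argv[0]);
    }

    int main(int argc, char ** argv) {
        gpt_params params;

        // the example id and the usage callback are now passed to the parser itself
        if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
            return 1;
        }

        gpt_init();

        LOG_INF("%s: arguments parsed\n", __func__);
        return 0;
    }
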
@@ -1,9 +1,5 @@
-#define LLAMA_API_INTERNAL

-#include "grammar-parser.h"
-#include "ggml.h"
-#include "llama.h"
 #include "unicode.h"
+#include "llama-grammar.h"

 #include <cstdio>
 #include <cstdlib>
@@ -12,29 +8,28 @@
 #include <string>
 #include <vector>

-static bool llama_sample_grammar_string(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
+static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
-auto decoded = decode_utf8(input_str, {});
-const auto & code_points = decoded.first;
+const auto cpts = unicode_cpts_from_utf8(input_str);

 const llama_grammar_rules & rules = llama_grammar_get_rules (grammar);
-llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar);
+llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);

 size_t pos = 0;
-for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
+for (const auto & cpt : cpts) {
-const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy
+const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy

-llama_grammar_accept(rules, prev_stacks, *it, cur_stacks);
+llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur);

-if (cur_stacks.empty()) {
+if (stacks_cur.empty()) {
 error_pos = pos;
-error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
+error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'";
-cur_stacks = prev_stacks;
+stacks_cur = stacks_prev;
 return false;
 }
 ++pos;
 }

-for (const auto & stack : cur_stacks) {
+for (const auto & stack : stacks_cur) {
 if (stack.empty()) {
 return true;
 }
@@ -85,27 +80,7 @@ int main(int argc, char** argv) {
 grammar_str = buffer.str();
 }

-// Parse the GBNF grammar
-auto parsed_grammar = grammar_parser::parse(grammar_str.c_str());

-// will be empty (default) if there are parse errors
-if (parsed_grammar.rules.empty()) {
-fprintf(stdout, "%s: failed to parse grammar\n", __func__);
-return 1;
-}

-// Ensure that there is a "root" node.
-if (parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()) {
-fprintf(stdout, "%s: grammar does not contain a 'root' symbol\n", __func__);
-return 1;
-}

-std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());

-// Create the LLAMA grammar
-auto grammar = llama_grammar_init(
-grammar_rules.data(),
-grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root");
 if (grammar == nullptr) {
 throw std::runtime_error("Failed to initialize llama_grammar");
 }
@@ -122,7 +97,7 @@ int main(int argc, char** argv) {
 // Validate the input string against the grammar
 size_t error_pos;
 std::string error_msg;
-bool is_valid = llama_sample_grammar_string(grammar, input_str, error_pos, error_msg);
+bool is_valid = llama_grammar_validate(grammar, input_str, error_pos, error_msg);

 if (is_valid) {
 fprintf(stdout, "Input string is valid according to the grammar.\n");
@@ -131,7 +106,7 @@ int main(int argc, char** argv) {
 }

 // Clean up
-llama_grammar_free(grammar);
+llama_grammar_free_impl(grammar);

 return 0;
 }
examples/gen-docs/CMakeLists.txt (new file, 5 lines)
@@ -0,0 +1,5 @@
+set(TARGET llama-gen-docs)
+add_executable(${TARGET} gen-docs.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/gen-docs/gen-docs.cpp (new file, 52 lines)
@@ -0,0 +1,52 @@
+#include "arg.h"
+#include "common.h"
+
+#include <fstream>
+#include <string>
+
+// Export usage message (-h) to markdown format
+
+static void export_md(std::string fname, llama_example ex) {
+std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
+
+gpt_params params;
+auto ctx_arg = gpt_params_parser_init(params, ex);
+
+file << "| Argument | Explanation |\n";
+file << "| -------- | ----------- |\n";
+for (auto & opt : ctx_arg.options) {
+file << "| `";
+// args
+for (const auto & arg : opt.args) {
+if (arg == opt.args.front()) {
+file << arg;
+if (opt.args.size() > 1) file << ", ";
+} else {
+file << arg << (arg != opt.args.back() ? ", " : "");
+}
+}
+// value hint
+if (opt.value_hint) {
+std::string md_value_hint(opt.value_hint);
+string_replace_all(md_value_hint, "|", "\\|");
+file << " " << md_value_hint;
+}
+if (opt.value_hint_2) {
+std::string md_value_hint_2(opt.value_hint_2);
+string_replace_all(md_value_hint_2, "|", "\\|");
+file << " " << md_value_hint_2;
+}
+// help text
+std::string md_help(opt.help);
+string_replace_all(md_help, "\n", "<br/>");
+string_replace_all(md_help, "|", "\\|");
+file << "` | " << md_help << " |\n";
+}
+}
+
+int main(int, char **) {
+export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN);
+export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER);
+
+return 0;
+}
@@ -152,7 +152,7 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
 throw std::invalid_argument("error: invalid parameter for argument: " + arg);
 }

-if (argc - arg_idx < 2) {
+if (argc - arg_idx != 2) {
 throw std::invalid_argument("error: bad arguments");
 }

@@ -389,10 +389,17 @@ static void gguf_merge(const split_params & split_params) {
 int n_split = 1;
 int total_tensors = 0;

-auto * ctx_out = gguf_init_empty();
+// avoid overwriting existing output file
+if (std::ifstream(split_params.output.c_str())) {
+fprintf(stderr, "%s: output file %s already exists\n", __func__, split_params.output.c_str());
+exit(EXIT_FAILURE);
+}

 std::ofstream fout(split_params.output.c_str(), std::ios::binary);
 fout.exceptions(std::ofstream::failbit); // fail fast on write errors

+auto * ctx_out = gguf_init_empty();

 std::vector<uint8_t> read_data;
 std::vector<ggml_context *> ctx_metas;
 std::vector<gguf_context *> ctx_ggufs;

@ -1,3 +1,4 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
@ -9,7 +10,7 @@
|
|||||||
static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
|
static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
|
||||||
std::vector<std::vector<float>> result;
|
std::vector<std::vector<float>> result;
|
||||||
|
|
||||||
const llama_model * mdl = llama_get_model(ctx);
|
const llama_model * model = llama_get_model(ctx);
|
||||||
|
|
||||||
llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
|
llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
|
||||||
|
|
||||||
@ -18,16 +19,16 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
|
|||||||
|
|
||||||
const std::string input_string = instruction + sentences[i];
|
const std::string input_string = instruction + sentences[i];
|
||||||
|
|
||||||
std::vector<llama_token> inputs = llama_tokenize(mdl, input_string, true, false);
|
std::vector<llama_token> inputs = llama_tokenize(model, input_string, true, false);
|
||||||
|
|
||||||
const int32_t n_toks = inputs.size();
|
const int32_t n_toks = inputs.size();
|
||||||
|
|
||||||
// GritLM seems to have EOS = ""
|
// GritLM seems to have EOS = ""
|
||||||
// https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
|
// https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
|
||||||
// inputs.push_back(llama_token_eos(mdl));
|
// inputs.push_back(llama_token_eos(model));
|
||||||
|
|
||||||
// we want to ignore instruction tokens for mean pooling
|
// we want to ignore instruction tokens for mean pooling
|
||||||
const int32_t n_inst = llama_tokenize(mdl, instruction, true, false).size();
|
const int32_t n_inst = llama_tokenize(model, instruction, true, false).size();
|
||||||
|
|
||||||
#ifdef GRIT_DEBUG
|
#ifdef GRIT_DEBUG
|
||||||
// debug tokens - should be matching as referenced in the GritLM sample
|
// debug tokens - should be matching as referenced in the GritLM sample
|
||||||
@ -51,7 +52,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
|
|||||||
llama_decode(ctx, batch);
|
llama_decode(ctx, batch);
|
||||||
|
|
||||||
// get embedding dimensions
|
// get embedding dimensions
|
||||||
uint64_t n_embd = llama_n_embd(mdl);
|
uint64_t n_embd = llama_n_embd(model);
|
||||||
|
|
||||||
// allocate embedding output
|
// allocate embedding output
|
||||||
std::vector<float> emb_unorm(n_embd, 0.0f);
|
std::vector<float> emb_unorm(n_embd, 0.0f);
|
||||||
@ -92,11 +93,11 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::string generate(llama_context * ctx, const std::string & prompt, bool stream) {
|
static std::string generate(llama_context * ctx, llama_sampler * smpl, const std::string & prompt, bool stream) {
|
||||||
std::string result;
|
std::string result;
|
||||||
|
|
||||||
const llama_model * mdl = llama_get_model(ctx);
|
const llama_model * model = llama_get_model(ctx);
|
||||||
llama_token eos_token = llama_token_eos(mdl);
|
llama_token eos_token = llama_token_eos(model);
|
||||||
|
|
||||||
llama_kv_cache_clear(ctx);
|
llama_kv_cache_clear(ctx);
|
||||||
llama_set_embeddings(ctx, false);
|
llama_set_embeddings(ctx, false);
|
||||||
@ -104,28 +105,24 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
|
|||||||
|
|
||||||
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
|
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
|
||||||
|
|
||||||
std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
|
std::vector<llama_token> inputs = llama_tokenize(model, prompt, false, true);
|
||||||
int32_t i_current_token = 0;
|
int32_t i_current_token = 0;
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
llama_batch_clear(bat);
|
llama_batch_clear(bat);
|
||||||
auto n_inputs = (int32_t)inputs.size();
|
{
|
||||||
for (int32_t i = 0; i < n_inputs; i++) {
|
const int32_t n_inputs = inputs.size();
|
||||||
llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
|
|
||||||
|
for (int32_t i = 0; i < n_inputs; i++) {
|
||||||
|
llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
inputs.clear();
|
inputs.clear();
|
||||||
|
|
||||||
llama_decode(ctx, bat);
|
llama_decode(ctx, bat);
|
||||||
auto logits = llama_get_logits_ith(ctx, bat.n_tokens - 1);
|
|
||||||
|
|
||||||
auto candidates = std::vector<llama_token_data>(llama_n_vocab(mdl));
|
llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1);
|
||||||
auto n_candidates = (int32_t)candidates.size();
|
|
||||||
for (int32_t token = 0; token < n_candidates; token++) {
|
|
||||||
candidates[token] = llama_token_data{ token, logits[token], 0.0f };
|
|
||||||
}
|
|
||||||
auto candidates_p = llama_token_data_array{ candidates.data(), candidates.size(), false };
|
|
||||||
|
|
||||||
llama_token token = llama_sample_token_greedy(ctx, &candidates_p);
|
|
||||||
if (token == eos_token) {
|
if (token == eos_token) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -157,20 +154,29 @@ static std::string gritlm_instruction(const std::string & instruction) {
|
|||||||
int main(int argc, char * argv[]) {
|
int main(int argc, char * argv[]) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
|
||||||
gpt_params_print_usage(argc, argv, params);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
gpt_init();
|
||||||
|
|
||||||
llama_model_params mparams = llama_model_params_from_gpt_params(params);
|
llama_model_params mparams = llama_model_params_from_gpt_params(params);
|
||||||
llama_context_params cparams = llama_context_params_from_gpt_params(params);
|
llama_context_params cparams = llama_context_params_from_gpt_params(params);
|
||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
|
|
||||||
llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);
|
llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
|
||||||
|
|
||||||
// create generation context
|
// create generation context
|
||||||
llama_context * ctx = llama_new_context_with_model(mdl, cparams);
|
llama_context * ctx = llama_new_context_with_model(model, cparams);
|
||||||
|
|
||||||
|
auto sparams = llama_sampler_chain_default_params();
|
||||||
|
|
||||||
|
sparams.no_perf = false;
|
||||||
|
|
||||||
|
llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
||||||
|
|
||||||
|
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
|
||||||
|
|
||||||
// ### Embedding/Representation ###
|
// ### Embedding/Representation ###
|
||||||
// samples taken from: https://github.com/ContextualAI/gritlm#basic
|
// samples taken from: https://github.com/ContextualAI/gritlm#basic
|
||||||
@ -191,7 +197,7 @@ int main(int argc, char * argv[]) {
|
|||||||
const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
|
const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
|
||||||
const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));
|
const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));
|
||||||
|
|
||||||
const int n_embd = llama_n_embd(mdl);
|
const int n_embd = llama_n_embd(model);
|
||||||
|
|
||||||
const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
|
const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
|
||||||
const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
|
const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
|
||||||
@ -208,11 +214,12 @@ int main(int argc, char * argv[]) {
|
|||||||
// GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
|
// GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
|
||||||
{
|
{
|
||||||
const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n";
|
const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n";
|
||||||
std::string response = generate(ctx, prompt, true);
|
std::string response = generate(ctx, smpl, prompt, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
llama_sampler_free(smpl);
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(mdl);
|
llama_free_model(model);
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
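The hunks above move the gritlm example from the old `llama_sample_token_greedy` path (a hand-built `llama_token_data_array`) to the new `llama_sampler` chain API and thread the sampler through `generate()`. Below is a condensed sketch of that call sequence only, not a drop-in replacement for the file; `greedy_generate` is a hypothetical wrapper, `generate()` is the example's own helper whose signature gains the sampler argument in this change, and `ctx`/`prompt` are assumed to be set up as in the example.

```cpp
#include "common.h"
#include "llama.h"

#include <string>

// Hedged sketch: the greedy sampler chain used by the updated gritlm example.
static std::string greedy_generate(llama_context * ctx, const std::string & prompt) {
    auto sparams = llama_sampler_chain_default_params();
    sparams.no_perf = false; // keep sampler timing, as in the example

    llama_sampler * smpl = llama_sampler_chain_init(sparams);
    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

    // generate() is the example's static helper; it now receives the sampler
    // instead of building a candidates array and calling
    // llama_sample_token_greedy() itself
    std::string response = generate(ctx, smpl, prompt, true);

    llama_sampler_free(smpl);
    return response;
}
```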
@ -1,4 +1,6 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "log.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
@ -17,15 +19,13 @@
|
|||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
static void print_usage(int, char ** argv) {
|
||||||
gpt_params_print_usage(argc, argv, params);
|
LOG("\nexample usage:\n");
|
||||||
|
LOG("\n %s \\\n"
|
||||||
LOG_TEE("\nexample usage:\n");
|
" -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n"
|
||||||
LOG_TEE("\n %s \\\n"
|
|
||||||
" -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
|
|
||||||
" [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
|
" [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
|
||||||
" [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
|
" [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
struct Stats {
|
struct Stats {
|
||||||
@ -126,12 +126,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
|||||||
e.counts.resize(src1->ne[0]*n_as, 0);
|
e.counts.resize(src1->ne[0]*n_as, 0);
|
||||||
}
|
}
|
||||||
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
|
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
|
||||||
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
|
LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
|
||||||
exit(1); //GGML_ABORT("fatal error");
|
exit(1); //GGML_ABORT("fatal error");
|
||||||
}
|
}
|
||||||
if (m_params.verbosity > 1) {
|
LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
|
||||||
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
|
|
||||||
}
|
|
||||||
// loop over all possible experts, regardless of whether they are used in the batch
|
// loop over all possible experts, regardless of whether they are used in the batch
|
||||||
for (int ex = 0; ex < n_as; ++ex) {
|
for (int ex = 0; ex < n_as; ++ex) {
|
||||||
size_t e_start = ex*src1->ne[0];
|
size_t e_start = ex*src1->ne[0];
|
||||||
@ -152,7 +150,8 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
|||||||
e.values[e_start + j] += x[j]*x[j];
|
e.values[e_start + j] += x[j]*x[j];
|
||||||
e.counts[e_start + j]++;
|
e.counts[e_start + j]++;
|
||||||
if (!std::isfinite(e.values[e_start + j])) {
|
if (!std::isfinite(e.values[e_start + j])) {
|
||||||
fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
|
LOG("\n");
|
||||||
|
LOG_ERR("%f detected in %s\n", e.values[e_start + j], wname.c_str());
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -175,20 +174,18 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
|||||||
e.counts.resize(src1->ne[0], 0);
|
e.counts.resize(src1->ne[0], 0);
|
||||||
}
|
}
|
||||||
else if (e.values.size() != (size_t)src1->ne[0]) {
|
else if (e.values.size() != (size_t)src1->ne[0]) {
|
||||||
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
|
LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
|
||||||
exit(1); //GGML_ABORT("fatal error");
|
exit(1); //GGML_ABORT("fatal error");
|
||||||
}
|
}
|
||||||
++e.ncall;
|
++e.ncall;
|
||||||
if (m_params.verbosity > 1) {
|
LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
|
||||||
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
|
|
||||||
}
|
|
||||||
for (int row = 0; row < (int)src1->ne[1]; ++row) {
|
for (int row = 0; row < (int)src1->ne[1]; ++row) {
|
||||||
const float * x = data + row * src1->ne[0];
|
const float * x = data + row * src1->ne[0];
|
||||||
for (int j = 0; j < (int)src1->ne[0]; ++j) {
|
for (int j = 0; j < (int)src1->ne[0]; ++j) {
|
||||||
e.values[j] += x[j]*x[j];
|
e.values[j] += x[j]*x[j];
|
||||||
e.counts[j]++;
|
e.counts[j]++;
|
||||||
if (!std::isfinite(e.values[j])) {
|
if (!std::isfinite(e.values[j])) {
|
||||||
fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
|
LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str());
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -240,17 +237,17 @@ void IMatrixCollector::save_imatrix(int ncall) const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (n_zeros != 0 && is_first) {
|
if (n_zeros != 0 && is_first) {
|
||||||
fprintf(stderr, "\n");
|
LOG_INF("\n");
|
||||||
is_first = false;
|
is_first = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (n_zeros == n_all) {
|
if (n_zeros == n_all) {
|
||||||
fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
|
LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (n_zeros > 0) {
|
if (n_zeros > 0) {
|
||||||
fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
|
LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -259,7 +256,7 @@ void IMatrixCollector::save_imatrix(int ncall) const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (to_store.size() < m_stats.size()) {
|
if (to_store.size() < m_stats.size()) {
|
||||||
fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
|
LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
std::ofstream out(fname, std::ios::binary);
|
std::ofstream out(fname, std::ios::binary);
|
||||||
@ -291,21 +288,20 @@ void IMatrixCollector::save_imatrix(int ncall) const {
|
|||||||
out.write(m_params.prompt_file.c_str(), len);
|
out.write(m_params.prompt_file.c_str(), len);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (m_params.verbosity > 0) {
|
LOGV(1, "\n");
|
||||||
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
|
LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool IMatrixCollector::load_imatrix(const char * fname) {
|
bool IMatrixCollector::load_imatrix(const char * fname) {
|
||||||
std::ifstream in(fname, std::ios::binary);
|
std::ifstream in(fname, std::ios::binary);
|
||||||
if (!in) {
|
if (!in) {
|
||||||
printf("%s: failed to open %s\n",__func__, fname);
|
LOG_ERR("%s: failed to open %s\n",__func__, fname);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
int n_entries;
|
int n_entries;
|
||||||
in.read((char*)&n_entries, sizeof(n_entries));
|
in.read((char*)&n_entries, sizeof(n_entries));
|
||||||
if (in.fail() || n_entries < 1) {
|
if (in.fail() || n_entries < 1) {
|
||||||
printf("%s: no data in file %s\n", __func__, fname);
|
LOG_ERR("%s: no data in file %s\n", __func__, fname);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
for (int i = 0; i < n_entries; ++i) {
|
for (int i = 0; i < n_entries; ++i) {
|
||||||
@ -313,7 +309,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
|
|||||||
std::vector<char> name_as_vec(len+1);
|
std::vector<char> name_as_vec(len+1);
|
||||||
in.read((char *)name_as_vec.data(), len);
|
in.read((char *)name_as_vec.data(), len);
|
||||||
if (in.fail()) {
|
if (in.fail()) {
|
||||||
printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
|
LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
name_as_vec[len] = 0;
|
name_as_vec[len] = 0;
|
||||||
@ -324,7 +320,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
|
|||||||
int nval;
|
int nval;
|
||||||
in.read((char *)&nval, sizeof(nval));
|
in.read((char *)&nval, sizeof(nval));
|
||||||
if (in.fail() || nval < 1) {
|
if (in.fail() || nval < 1) {
|
||||||
printf("%s: failed reading number of values for entry %d\n",__func__,i);
|
LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i);
|
||||||
m_stats = {};
|
m_stats = {};
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -337,7 +333,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
|
|||||||
std::vector<float> tmp(nval);
|
std::vector<float> tmp(nval);
|
||||||
in.read((char*)tmp.data(), nval*sizeof(float));
|
in.read((char*)tmp.data(), nval*sizeof(float));
|
||||||
if (in.fail()) {
|
if (in.fail()) {
|
||||||
printf("%s: failed reading data for entry %d\n",__func__,i);
|
LOG_ERR("%s: failed reading data for entry %d\n",__func__,i);
|
||||||
m_stats = {};
|
m_stats = {};
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -438,26 +434,25 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
|||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
auto tim1 = std::chrono::high_resolution_clock::now();
|
auto tim1 = std::chrono::high_resolution_clock::now();
|
||||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
LOG_INF("%s: tokenizing the input ..\n", __func__);
|
||||||
|
|
||||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||||
|
|
||||||
auto tim2 = std::chrono::high_resolution_clock::now();
|
auto tim2 = std::chrono::high_resolution_clock::now();
|
||||||
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
||||||
|
|
||||||
if (params.i_chunk > 0) {
|
if (params.i_chunk > 0) {
|
||||||
if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
|
if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
|
||||||
fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
|
LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
|
LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
|
||||||
tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
|
tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (int(tokens.size()) < 2*n_ctx) {
|
if (int(tokens.size()) < 2*n_ctx) {
|
||||||
fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
|
LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx);
|
||||||
n_ctx);
|
LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size());
|
||||||
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -479,7 +474,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
|||||||
double nll = 0.0;
|
double nll = 0.0;
|
||||||
double nll2 = 0.0;
|
double nll2 = 0.0;
|
||||||
|
|
||||||
fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
|
LOG_INF("%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
|
||||||
|
|
||||||
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
|
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
|
||||||
|
|
||||||
@ -515,7 +510,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
|||||||
|
|
||||||
// TODO: use batch.logits to save computations instead of relying on logits_all == true
|
// TODO: use batch.logits to save computations instead of relying on logits_all == true
|
||||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
|
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
|
||||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
LOG_ERR("%s : failed to eval\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -532,29 +527,29 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
|||||||
|
|
||||||
if (i == 0) {
|
if (i == 0) {
|
||||||
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
|
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
|
||||||
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
|
LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
|
||||||
int total_seconds = (int)(t_total * n_chunk);
|
int total_seconds = (int)(t_total * n_chunk);
|
||||||
if (total_seconds >= 60*60) {
|
if (total_seconds >= 60*60) {
|
||||||
fprintf(stderr, "%d hours ", total_seconds / (60*60));
|
LOG("%d hours ", total_seconds / (60*60));
|
||||||
total_seconds = total_seconds % (60*60);
|
total_seconds = total_seconds % (60*60);
|
||||||
}
|
}
|
||||||
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
|
LOG("%.2f minutes\n", total_seconds / 60.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.compute_ppl) {
|
if (params.compute_ppl) {
|
||||||
const int first = n_ctx/2;
|
const int first = n_ctx/2;
|
||||||
const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
|
const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
|
||||||
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
|
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
|
||||||
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
|
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
|
||||||
count += n_ctx - first - 1;
|
count += n_ctx - first - 1;
|
||||||
|
|
||||||
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
|
LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
|
|
||||||
logits.clear();
|
logits.clear();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
|
|
||||||
if (params.compute_ppl) {
|
if (params.compute_ppl) {
|
||||||
nll2 /= count;
|
nll2 /= count;
|
||||||
@ -563,9 +558,9 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
|||||||
nll2 -= nll * nll;
|
nll2 -= nll * nll;
|
||||||
if (nll2 > 0) {
|
if (nll2 > 0) {
|
||||||
nll2 = sqrt(nll2/(count-1));
|
nll2 = sqrt(nll2/(count-1));
|
||||||
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
|
LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
|
||||||
} else {
|
} else {
|
||||||
printf("Unexpected negative standard deviation of log(prob)\n");
|
LOG("Unexpected negative standard deviation of log(prob)\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -577,27 +572,27 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
params.n_ctx = 512;
|
params.n_ctx = 512;
|
||||||
params.logits_all = true;
|
params.logits_all = true;
|
||||||
params.verbosity = 1;
|
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
|
||||||
print_usage(argc, argv, params);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
gpt_init();
|
||||||
|
|
||||||
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
||||||
|
|
||||||
g_collector.set_params(params);
|
g_collector.set_params(params);
|
||||||
|
|
||||||
for (const auto & in_file : params.in_files) {
|
for (const auto & in_file : params.in_files) {
|
||||||
printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
|
LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
|
||||||
if (!g_collector.load_imatrix(in_file.c_str())) {
|
if (!g_collector.load_imatrix(in_file.c_str())) {
|
||||||
fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
|
LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str());
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.in_files.size() > 1) {
|
if (params.in_files.size() > 1) {
|
||||||
printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
|
LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
|
||||||
g_collector.save_imatrix();
|
g_collector.save_imatrix();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -616,20 +611,20 @@ int main(int argc, char ** argv) {
|
|||||||
llama_model * model = llama_init.model;
|
llama_model * model = llama_init.model;
|
||||||
llama_context * ctx = llama_init.context;
|
llama_context * ctx = llama_init.context;
|
||||||
if (model == nullptr || ctx == nullptr) {
|
if (model == nullptr || ctx == nullptr) {
|
||||||
fprintf(stderr, "%s : failed to init\n", __func__);
|
LOG_ERR("%s : failed to init\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int n_ctx_train = llama_n_ctx_train(model);
|
const int n_ctx_train = llama_n_ctx_train(model);
|
||||||
if (params.n_ctx > n_ctx_train) {
|
if (params.n_ctx > n_ctx_train) {
|
||||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
|
||||||
__func__, n_ctx_train, params.n_ctx);
|
__func__, n_ctx_train, params.n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
fprintf(stderr, "\n");
|
LOG_INF("\n");
|
||||||
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!compute_imatrix(ctx, params)) {
|
if (!compute_imatrix(ctx, params)) {
|
||||||
@ -638,7 +633,8 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
g_collector.save_imatrix();
|
g_collector.save_imatrix();
|
||||||
|
|
||||||
llama_print_timings(ctx);
|
LOG("\n");
|
||||||
|
llama_perf_context_print(ctx);
|
||||||
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
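Throughout imatrix.cpp the diff above swaps raw `fprintf(stderr, ...)`/`printf(...)` calls for the severity-tagged `LOG_INF`/`LOG_WRN`/`LOG_ERR` macros and replaces manual `m_params.verbosity` checks with `LOG_DBGV(level, ...)`. A minimal sketch of the resulting pattern, assuming the `log.h` macros from this tree; `report_chunk` and its arguments are illustrative, not part of the diff:

```cpp
#include "log.h"

#include <cmath>

// Hedged illustration of the logging convention adopted in imatrix.cpp.
static void report_chunk(int chunk, double value) {
    LOG_INF("%s: processed chunk %d\n", __func__, chunk);   // was fprintf(stderr, ...)
    LOG_DBGV(2, "%s: value = %.4f\n", __func__, value);     // was gated on m_params.verbosity > 1

    if (!std::isfinite(value)) {
        LOG_ERR("%s: non-finite value in chunk %d\n", __func__, chunk); // was fprintf(stderr, ...) before exit(1)
    }
}
```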
@ -1,8 +1,9 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#include "console.h"
|
#include "console.h"
|
||||||
|
#include "sampling.h"
|
||||||
|
#include "log.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "grammar-parser.h"
|
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <cinttypes>
|
#include <cinttypes>
|
||||||
@ -34,6 +35,7 @@
|
|||||||
|
|
||||||
static llama_context ** g_ctx;
|
static llama_context ** g_ctx;
|
||||||
static llama_model ** g_model;
|
static llama_model ** g_model;
|
||||||
|
static gpt_sampler ** g_smpl;
|
||||||
static gpt_params * g_params;
|
static gpt_params * g_params;
|
||||||
static std::vector<llama_token> * g_input_tokens;
|
static std::vector<llama_token> * g_input_tokens;
|
||||||
static std::ostringstream * g_output_ss;
|
static std::ostringstream * g_output_ss;
|
||||||
@ -54,7 +56,7 @@ static void write_logfile(
|
|||||||
|
|
||||||
const bool success = fs_create_directory_with_parents(params.logdir);
|
const bool success = fs_create_directory_with_parents(params.logdir);
|
||||||
if (!success) {
|
if (!success) {
|
||||||
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
LOG_ERR("%s: warning: failed to create logdir %s, cannot write logfile\n",
|
||||||
__func__, params.logdir.c_str());
|
__func__, params.logdir.c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -63,7 +65,7 @@ static void write_logfile(
|
|||||||
FILE * logfile = fopen(logfile_path.c_str(), "w");
|
FILE * logfile = fopen(logfile_path.c_str(), "w");
|
||||||
|
|
||||||
if (logfile == NULL) {
|
if (logfile == NULL) {
|
||||||
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -81,7 +83,7 @@ static void write_logfile(
|
|||||||
yaml_dump_string_multiline(logfile, "output", output.c_str());
|
yaml_dump_string_multiline(logfile, "output", output.c_str());
|
||||||
yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
|
yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
|
||||||
|
|
||||||
llama_dump_timing_info_yaml(logfile, ctx);
|
llama_perf_dump_yaml(logfile, ctx);
|
||||||
fclose(logfile);
|
fclose(logfile);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -92,8 +94,8 @@ static void sigint_handler(int signo) {
|
|||||||
is_interacting = true;
|
is_interacting = true;
|
||||||
} else {
|
} else {
|
||||||
console::cleanup();
|
console::cleanup();
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
llama_print_timings(*g_ctx);
|
gpt_perf_print(*g_ctx, *g_smpl);
|
||||||
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
|
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
|
||||||
_exit(130);
|
_exit(130);
|
||||||
}
|
}
|
||||||
@ -103,109 +105,95 @@ static void sigint_handler(int signo) {
|
|||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
llama_sampling_params & sparams = params.sparams;
|
|
||||||
g_params = &params;
|
g_params = &params;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
|
||||||
gpt_params_print_usage(argc, argv, params);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
gpt_init();
|
||||||
log_set_target(log_filename_generator("infill", "log"));
|
|
||||||
LOG_TEE("Log start\n");
|
auto & sparams = params.sparams;
|
||||||
log_dump_cmdline(argc, argv);
|
|
||||||
#endif // LOG_DISABLE_LOGS
|
|
||||||
|
|
||||||
console::init(params.simple_io, params.use_color);
|
console::init(params.simple_io, params.use_color);
|
||||||
atexit([]() { console::cleanup(); });
|
atexit([]() { console::cleanup(); });
|
||||||
|
|
||||||
if (params.logits_all) {
|
if (params.logits_all) {
|
||||||
printf("\n************\n");
|
LOG_ERR("\n************\n");
|
||||||
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
|
LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
|
||||||
printf("************\n\n");
|
LOG_ERR("************\n\n");
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.embedding) {
|
if (params.embedding) {
|
||||||
printf("\n************\n");
|
LOG_ERR("\n************\n");
|
||||||
printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
|
LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
|
||||||
printf("************\n\n");
|
LOG_ERR("************\n\n");
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.n_ctx != 0 && params.n_ctx < 8) {
|
if (params.n_ctx != 0 && params.n_ctx < 8) {
|
||||||
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
|
||||||
params.n_ctx = 8;
|
params.n_ctx = 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
|
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
|
||||||
printf("\n************\n");
|
LOG_ERR("\n************\n");
|
||||||
printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
|
LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
|
||||||
printf("************\n\n");
|
LOG_ERR("************\n\n");
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.rope_freq_base != 0.0) {
|
if (params.rope_freq_base != 0.0) {
|
||||||
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
|
LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.rope_freq_scale != 0.0) {
|
if (params.rope_freq_scale != 0.0) {
|
||||||
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
|
LOG_INF("%s: llama backend init\n", __func__);
|
||||||
LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
|
||||||
|
|
||||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
|
||||||
params.seed = time(NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG_TEE("%s: seed = %u\n", __func__, params.seed);
|
|
||||||
|
|
||||||
std::mt19937 rng(params.seed);
|
|
||||||
|
|
||||||
LOG("%s: llama backend init\n", __func__);
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
|
|
||||||
llama_model * model;
|
llama_model * model = nullptr;
|
||||||
llama_context * ctx;
|
llama_context * ctx = nullptr;
|
||||||
|
gpt_sampler * smpl = nullptr;
|
||||||
|
|
||||||
g_model = &model;
|
g_model = &model;
|
||||||
g_ctx = &ctx;
|
g_ctx = &ctx;
|
||||||
|
g_smpl = &smpl;
|
||||||
|
|
||||||
// load the model and apply lora adapter, if any
|
// load the model and apply lora adapter, if any
|
||||||
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||||
llama_init_result llama_init = llama_init_from_gpt_params(params);
|
llama_init_result llama_init = llama_init_from_gpt_params(params);
|
||||||
|
|
||||||
model = llama_init.model;
|
model = llama_init.model;
|
||||||
ctx = llama_init.context;
|
ctx = llama_init.context;
|
||||||
|
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
LOG_TEE("%s: error: unable to load model\n", __func__);
|
LOG_ERR("%s: unable to load model\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int n_ctx_train = llama_n_ctx_train(model);
|
const int n_ctx_train = llama_n_ctx_train(model);
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
LOG("n_ctx: %d\n", n_ctx);
|
LOG_DBG("n_ctx: %d\n", n_ctx);
|
||||||
|
|
||||||
if (n_ctx > n_ctx_train) {
|
if (n_ctx > n_ctx_train) {
|
||||||
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
|
||||||
__func__, n_ctx_train, n_ctx);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
|
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
const bool add_bos = llama_add_bos_token(model);
|
const bool add_bos = llama_add_bos_token(model);
|
||||||
GGML_ASSERT(!llama_add_eos_token(model));
|
GGML_ASSERT(!llama_add_eos_token(model));
|
||||||
LOG("add_bos: %d\n", add_bos);
|
|
||||||
|
|
||||||
std::vector<llama_token> embd_inp;
|
std::vector<llama_token> embd_inp;
|
||||||
std::vector<llama_token> embd_end;
|
std::vector<llama_token> embd_end;
|
||||||
@ -230,18 +218,19 @@ int main(int argc, char ** argv) {
|
|||||||
embd_inp.push_back(middle_token);
|
embd_inp.push_back(middle_token);
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
|
LOG_DBG("add_bos: %d\n", add_bos);
|
||||||
LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
|
LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
|
||||||
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
|
||||||
|
LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
|
||||||
|
|
||||||
// Should not run without any tokens
|
// Should not run without any tokens
|
||||||
if (embd_inp.empty()) {
|
if (embd_inp.empty()) {
|
||||||
embd_inp.push_back(llama_token_bos(model));
|
embd_inp.push_back(llama_token_bos(model));
|
||||||
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((int) embd_inp.size() > n_ctx - 4) {
|
if ((int) embd_inp.size() > n_ctx - 4) {
|
||||||
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -250,9 +239,8 @@ int main(int argc, char ** argv) {
|
|||||||
params.n_keep = (int)embd_inp.size();
|
params.n_keep = (int)embd_inp.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
|
LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
|
||||||
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
|
LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());
|
||||||
|
|
||||||
|
|
||||||
// enable interactive mode if interactive start is specified
|
// enable interactive mode if interactive start is specified
|
||||||
if (params.interactive_first) {
|
if (params.interactive_first) {
|
||||||
@ -260,21 +248,21 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (params.verbose_prompt) {
|
if (params.verbose_prompt) {
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||||
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||||
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.n_keep > 0) {
|
if (params.n_keep > 0) {
|
||||||
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
|
LOG_INF("%s: static prompt based on n_keep: '", __func__);
|
||||||
for (int i = 0; i < params.n_keep; i++) {
|
for (int i = 0; i < params.n_keep; i++) {
|
||||||
LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||||
}
|
}
|
||||||
LOG_TEE("'\n");
|
LOG("'\n");
|
||||||
}
|
}
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
@ -291,30 +279,30 @@ int main(int argc, char ** argv) {
|
|||||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
LOG_TEE("%s: interactive mode on.\n", __func__);
|
LOG_INF("%s: interactive mode on.\n", __func__);
|
||||||
|
|
||||||
if (params.input_prefix_bos) {
|
if (params.input_prefix_bos) {
|
||||||
LOG_TEE("Input prefix with BOS\n");
|
LOG_INF("Input prefix with BOS\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!params.input_prefix.empty()) {
|
if (!params.input_prefix.empty()) {
|
||||||
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
|
LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!params.input_suffix.empty()) {
|
if (!params.input_suffix.empty()) {
|
||||||
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
|
LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
|
smpl = gpt_sampler_init(model, sparams);
|
||||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
|
||||||
LOG_TEE("\n\n");
|
|
||||||
|
|
||||||
LOG_TEE("\n##### Infill mode #####\n\n");
|
LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
|
||||||
if (params.infill) {
|
LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
|
||||||
printf("\n************\n");
|
LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
|
||||||
printf("no need to specify '--infill', always running infill\n");
|
|
||||||
printf("************\n\n");
|
LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||||
}
|
|
||||||
|
LOG("\n");
|
||||||
|
LOG("\n##### Infill mode #####\n\n");
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
const char *control_message;
|
const char *control_message;
|
||||||
if (params.multiline_input) {
|
if (params.multiline_input) {
|
||||||
@ -325,11 +313,11 @@ int main(int argc, char ** argv) {
|
|||||||
" - To return control without starting a new line, end your input with '/'.\n"
|
" - To return control without starting a new line, end your input with '/'.\n"
|
||||||
" - If you want to submit another line, end your input with '\\'.\n";
|
" - If you want to submit another line, end your input with '\\'.\n";
|
||||||
}
|
}
|
||||||
LOG_TEE("== Running in interactive mode. ==\n");
|
LOG("== Running in interactive mode. ==\n");
|
||||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||||
LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
|
LOG( " - Press Ctrl+C to interject at any time.\n");
|
||||||
#endif
|
#endif
|
||||||
LOG_TEE( "%s\n", control_message);
|
LOG( "%s\n", control_message);
|
||||||
|
|
||||||
is_interacting = params.interactive_first;
|
is_interacting = params.interactive_first;
|
||||||
}
|
}
|
||||||
@ -349,8 +337,6 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
std::vector<llama_token> embd;
|
std::vector<llama_token> embd;
|
||||||
|
|
||||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
|
|
||||||
|
|
||||||
while (n_remain != 0 || params.interactive) {
|
while (n_remain != 0 || params.interactive) {
|
||||||
// predict
|
// predict
|
||||||
if (!embd.empty()) {
|
if (!embd.empty()) {
|
||||||
@ -364,9 +350,8 @@ int main(int argc, char ** argv) {
|
|||||||
embd.resize(max_embd_size);
|
embd.resize(max_embd_size);
|
||||||
|
|
||||||
console::set_display(console::error);
|
console::set_display(console::error);
|
||||||
printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
||||||
console::set_display(console::reset);
|
console::set_display(console::reset);
|
||||||
fflush(stdout);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// infinite text generation via context swapping
|
// infinite text generation via context swapping
|
||||||
@ -375,14 +360,14 @@ int main(int argc, char ** argv) {
|
|||||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
||||||
if (n_past + (int) embd.size() > n_ctx) {
|
if (n_past + (int) embd.size() > n_ctx) {
|
||||||
if (params.n_predict == -2) {
|
if (params.n_predict == -2) {
|
||||||
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int n_left = n_past - params.n_keep - 1;
|
const int n_left = n_past - params.n_keep - 1;
|
||||||
const int n_discard = n_left/2;
|
const int n_discard = n_left/2;
|
||||||
|
|
||||||
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||||
|
|
||||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||||
@ -390,9 +375,9 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
n_past -= n_discard;
|
n_past -= n_discard;
|
||||||
|
|
||||||
LOG("after swap: n_past = %d\n", n_past);
|
LOG_DBG("after swap: n_past = %d\n", n_past);
|
||||||
|
|
||||||
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -404,16 +389,16 @@ int main(int argc, char ** argv) {
|
|||||||
n_eval = params.n_batch;
|
n_eval = params.n_batch;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
|
||||||
|
|
||||||
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
|
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
|
||||||
LOG_TEE("%s : failed to eval\n", __func__);
|
LOG_ERR("%s : failed to eval\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
n_past += n_eval;
|
n_past += n_eval;
|
||||||
|
|
||||||
LOG("n_past = %d\n", n_past);
|
LOG_DBG("n_past = %d\n", n_past);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -421,11 +406,11 @@ int main(int argc, char ** argv) {
|
|||||||
embd.clear();
|
embd.clear();
|
||||||
|
|
||||||
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
||||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);
|
const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
|
||||||
|
|
||||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
gpt_sampler_accept(smpl, id, true);
|
||||||
|
|
||||||
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
|
// LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
|
||||||
|
|
||||||
embd.push_back(id);
|
embd.push_back(id);
|
||||||
|
|
||||||
@ -435,16 +420,16 @@ int main(int argc, char ** argv) {
|
|||||||
// decrement remaining sampling budget
|
// decrement remaining sampling budget
|
||||||
--n_remain;
|
--n_remain;
|
||||||
|
|
||||||
LOG("n_remain: %d\n", n_remain);
|
LOG_DBG("n_remain: %d\n", n_remain);
|
||||||
} else {
|
} else {
|
||||||
// some user input remains from prompt or interaction, forward it to processing
|
// some user input remains from prompt or interaction, forward it to processing
|
||||||
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
||||||
while ((int) embd_inp.size() > n_consumed) {
|
while ((int) embd_inp.size() > n_consumed) {
|
||||||
embd.push_back(embd_inp[n_consumed]);
|
embd.push_back(embd_inp[n_consumed]);
|
||||||
|
|
||||||
// push the prompt into the sampling context in order to apply repetition penalties later
|
// push the prompt into the sampling context in order to apply repetition penalties later
|
||||||
// for the prompt, we don't apply grammar rules
|
// for the prompt, we don't apply grammar rules
|
||||||
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
|
gpt_sampler_accept(smpl, embd_inp[n_consumed], false);
|
||||||
|
|
||||||
++n_consumed;
|
++n_consumed;
|
||||||
if ((int) embd.size() >= params.n_batch) {
|
if ((int) embd.size() >= params.n_batch) {
|
||||||
@ -457,7 +442,7 @@ int main(int argc, char ** argv) {
|
|||||||
if (input_echo) {
|
if (input_echo) {
|
||||||
for (auto id : embd) {
|
for (auto id : embd) {
|
||||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||||
printf("%s", token_str.c_str());
|
LOG("%s", token_str.c_str());
|
||||||
|
|
||||||
if (embd.size() > 1) {
|
if (embd.size() > 1) {
|
||||||
input_tokens.push_back(id);
|
input_tokens.push_back(id);
|
||||||
@ -466,7 +451,6 @@ int main(int argc, char ** argv) {
|
|||||||
output_ss << token_str;
|
output_ss << token_str;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fflush(stdout);
|
|
||||||
}
|
}
|
||||||
// reset color to default if there is no pending user input
|
// reset color to default if there is no pending user input
|
||||||
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
||||||
@ -476,13 +460,12 @@ int main(int argc, char ** argv) {
|
|||||||
// if not currently processing queued inputs;
|
// if not currently processing queued inputs;
|
||||||
if ((int) embd_inp.size() <= n_consumed) {
|
if ((int) embd_inp.size() <= n_consumed) {
|
||||||
// deal with eot token in infill mode
|
// deal with eot token in infill mode
|
||||||
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
|
if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
|
||||||
if (is_interacting && !params.interactive_first) {
|
if (is_interacting && !params.interactive_first) {
|
||||||
// print an eot token
|
// print an eot token
|
||||||
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
||||||
}
|
}
|
||||||
fflush(stdout);
|
LOG("\n");
|
||||||
printf("\n");
|
|
||||||
console::set_display(console::user_input);
|
console::set_display(console::user_input);
|
||||||
std::string buffer;
|
std::string buffer;
|
||||||
std::string line;
|
std::string line;
|
||||||
@ -538,35 +521,33 @@ int main(int argc, char ** argv) {
|
|||||||
n_remain = params.n_predict;
|
n_remain = params.n_predict;
|
||||||
n_past = 0;
|
n_past = 0;
|
||||||
n_consumed = 0;
|
n_consumed = 0;
|
||||||
// LOG_TEE("took new input\n");
|
|
||||||
is_interacting = false;
|
is_interacting = false;
|
||||||
}
|
}
|
||||||
// deal with end of generation tokens in interactive mode
|
// deal with end of generation tokens in interactive mode
|
||||||
else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
|
else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
|
||||||
LOG("found EOS token\n");
|
LOG_DBG("found EOS token\n");
|
||||||
|
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
|
|
||||||
is_interacting = true;
|
is_interacting = true;
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
console::set_display(console::user_input);
|
console::set_display(console::user_input);
|
||||||
fflush(stdout);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (n_past > 0 && is_interacting && !params.interactive) {
|
if (n_past > 0 && is_interacting && !params.interactive) {
|
||||||
LOG("waiting for user input\n");
|
LOG_DBG("waiting for user input\n");
|
||||||
|
|
||||||
if (params.input_prefix_bos) {
|
if (params.input_prefix_bos) {
|
||||||
LOG("adding input prefix BOS token\n");
|
LOG_DBG("adding input prefix BOS token\n");
|
||||||
embd_inp.push_back(llama_token_bos(model));
|
embd_inp.push_back(llama_token_bos(model));
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string buffer;
|
std::string buffer;
|
||||||
if (!params.input_prefix.empty()) {
|
if (!params.input_prefix.empty()) {
|
||||||
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
||||||
buffer += params.input_prefix;
|
buffer += params.input_prefix;
|
||||||
printf("%s", buffer.c_str());
|
LOG("%s", buffer.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string line;
|
std::string line;
|
||||||
@ -584,17 +565,17 @@ int main(int argc, char ** argv) {
|
|||||||
if (buffer.length() > 1) {
|
if (buffer.length() > 1) {
|
||||||
// append input suffix if any
|
// append input suffix if any
|
||||||
if (!params.input_suffix.empty()) {
|
if (!params.input_suffix.empty()) {
|
||||||
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
||||||
buffer += params.input_suffix;
|
buffer += params.input_suffix;
|
||||||
printf("%s", params.input_suffix.c_str());
|
LOG("%s", params.input_suffix.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("buffer: '%s'\n", buffer.c_str());
|
LOG_DBG("buffer: '%s'\n", buffer.c_str());
|
||||||
|
|
||||||
const size_t original_size = embd_inp.size();
|
const size_t original_size = embd_inp.size();
|
||||||
|
|
||||||
const auto line_inp = ::llama_tokenize(ctx, buffer, false);
|
const auto line_inp = ::llama_tokenize(ctx, buffer, false);
|
||||||
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
|
LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
|
||||||
|
|
||||||
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
||||||
|
|
||||||
@ -605,9 +586,9 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
n_remain -= line_inp.size();
|
n_remain -= line_inp.size();
|
||||||
LOG("n_remain: %d\n", n_remain);
|
LOG_DBG("n_remain: %d\n", n_remain);
|
||||||
} else {
|
} else {
|
||||||
LOG("empty line, passing control back\n");
|
LOG_DBG("empty line, passing control back\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
input_echo = false; // do not echo this again
|
input_echo = false; // do not echo this again
|
||||||
@ -615,7 +596,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
if (n_past > 0) {
|
if (n_past > 0) {
|
||||||
if (is_interacting) {
|
if (is_interacting) {
|
||||||
llama_sampling_reset(ctx_sampling);
|
gpt_sampler_reset(smpl);
|
||||||
}
|
}
|
||||||
is_interacting = false;
|
is_interacting = false;
|
||||||
}
|
}
|
||||||
@ -634,22 +615,18 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!params.interactive && n_remain <= 0) {
|
if (!params.interactive && n_remain <= 0) {
|
||||||
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
||||||
fflush(stdout);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_print_timings(ctx);
|
LOG("\n");
|
||||||
|
gpt_perf_print(ctx, smpl);
|
||||||
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
|
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
|
||||||
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
|
||||||
llama_sampling_free(ctx_sampling);
|
gpt_sampler_free(smpl);
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
|
||||||
LOG_TEE("Log end\n");
|
|
||||||
#endif // LOG_DISABLE_LOGS
|
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
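The infill.cpp hunks above replace the old `llama_sampling_context`/`llama_sampling_*` calls with the new `gpt_sampler` wrapper. The following is a hedged, condensed sketch of that lifecycle only; `sample_loop` is a hypothetical helper, batching, interaction handling and the infill-specific prefix/suffix logic from the example are omitted, and `model`/`ctx` are assumed to be initialized as in the example.

```cpp
#include "common.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"

#include <vector>

// Hedged sketch of the gpt_sampler lifecycle used by the updated example.
static void sample_loop(llama_model * model, llama_context * ctx,
                        gpt_params & params, std::vector<llama_token> & embd_inp) {
    gpt_sampler * smpl = gpt_sampler_init(model, params.sparams); // was llama_sampling_init(sparams)

    // evaluate the prompt in one call (the example splits it into n_batch chunks)
    if (llama_decode(ctx, llama_batch_get_one(embd_inp.data(), (int) embd_inp.size(), 0, 0))) {
        LOG_ERR("%s : failed to eval\n", __func__);
        return;
    }
    int n_past = (int) embd_inp.size();

    // prompt tokens are accepted without applying grammar rules, as in the example
    for (llama_token tok : embd_inp) {
        gpt_sampler_accept(smpl, tok, false);
    }

    while (true) {
        llama_token id = gpt_sampler_sample(smpl, ctx, -1); // was llama_sampling_sample(...)
        gpt_sampler_accept(smpl, id, true);                 // was llama_sampling_accept(...)

        if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
            break; // end-of-generation token
        }

        // feed the sampled token back so the next sample sees updated logits
        if (llama_decode(ctx, llama_batch_get_one(&id, 1, n_past, 0))) {
            LOG_ERR("%s : failed to eval\n", __func__);
            break;
        }
        n_past += 1;
    }

    gpt_perf_print(ctx, smpl); // was llama_print_timings(ctx)
    gpt_sampler_free(smpl);    // was llama_sampling_free(ctx_sampling)
}
```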
@ -14,7 +14,8 @@ Performance testing tool for llama.cpp.
|
|||||||
1. [Markdown](#markdown)
|
1. [Markdown](#markdown)
|
||||||
2. [CSV](#csv)
|
2. [CSV](#csv)
|
||||||
3. [JSON](#json)
|
3. [JSON](#json)
|
||||||
4. [SQL](#sql)
|
4. [JSONL](#jsonl)
|
||||||
|
5. [SQL](#sql)
|
||||||
|
|
||||||
## Syntax
|
## Syntax
|
||||||
|
|
||||||
@ -23,27 +24,34 @@ usage: ./llama-bench [options]
|
|||||||
|
|
||||||
options:
|
options:
|
||||||
-h, --help
|
-h, --help
|
||||||
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
|
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
|
||||||
-p, --n-prompt <n> (default: 512)
|
-p, --n-prompt <n> (default: 512)
|
||||||
-n, --n-gen <n> (default: 128)
|
-n, --n-gen <n> (default: 128)
|
||||||
-pg <pp,tg> (default: 512,128)
|
-pg <pp,tg> (default: )
|
||||||
-b, --batch-size <n> (default: 2048)
|
-b, --batch-size <n> (default: 2048)
|
||||||
-ub, --ubatch-size <n> (default: 512)
|
-ub, --ubatch-size <n> (default: 512)
|
||||||
-ctk, --cache-type-k <t> (default: f16)
|
-ctk, --cache-type-k <t> (default: f16)
|
||||||
-ctv, --cache-type-v <t> (default: f16)
|
-ctv, --cache-type-v <t> (default: f16)
|
||||||
-t, --threads <n> (default: 16)
|
-t, --threads <n> (default: 8)
|
||||||
-ngl, --n-gpu-layers <n> (default: 99)
|
-C, --cpu-mask <hex,hex> (default: 0x0)
|
||||||
-sm, --split-mode <none|layer|row> (default: layer)
|
--cpu-strict <0|1> (default: 0)
|
||||||
-mg, --main-gpu <i> (default: 0)
|
--poll <0...100> (default: 50)
|
||||||
-nkvo, --no-kv-offload <0|1> (default: 0)
|
-ngl, --n-gpu-layers <n> (default: 99)
|
||||||
-fa, --flash-attn <0|1> (default: 0)
|
-rpc, --rpc <rpc_servers> (default: )
|
||||||
-mmp, --mmap <0|1> (default: 1)
|
-sm, --split-mode <none|layer|row> (default: layer)
|
||||||
--numa <distribute|isolate|numactl> (default: disabled)
|
-mg, --main-gpu <i> (default: 0)
|
||||||
-embd, --embeddings <0|1> (default: 0)
|
-nkvo, --no-kv-offload <0|1> (default: 0)
|
||||||
-ts, --tensor-split <ts0/ts1/..> (default: 0)
|
-fa, --flash-attn <0|1> (default: 0)
|
||||||
-r, --repetitions <n> (default: 5)
|
-mmp, --mmap <0|1> (default: 1)
|
||||||
-o, --output <csv|json|md|sql> (default: md)
|
--numa <distribute|isolate|numactl> (default: disabled)
|
||||||
-v, --verbose (default: 0)
|
-embd, --embeddings <0|1> (default: 0)
|
||||||
|
-ts, --tensor-split <ts0/ts1/..> (default: 0)
|
||||||
|
-r, --repetitions <n> (default: 5)
|
||||||
|
--prio <0|1|2|3> (default: 0)
|
||||||
|
--delay <0...N> (seconds) (default: 0)
|
||||||
|
-o, --output <csv|json|jsonl|md|sql> (default: md)
|
||||||
|
-oe, --output-err <csv|json|jsonl|md|sql> (default: none)
|
||||||
|
-v, --verbose (default: 0)
|
||||||
|
|
||||||
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
|
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
|
||||||
```
|
```
|
||||||
@ -238,6 +246,19 @@ $ ./llama-bench -o json
|
|||||||
]
|
]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### JSONL
|
||||||
|
|
||||||
|
```sh
|
||||||
|
$ ./llama-bench -o jsonl
|
||||||
|
```
|
||||||
|
|
||||||
|
```json lines
|
||||||
|
{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]}
|
||||||
|
{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]}
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
### SQL
|
### SQL
|
||||||
|
|
||||||
SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database.
|
SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database.
|
||||||
|
@@ -16,6 +16,7 @@
 #include <sstream>
 #include <string>
 #include <vector>
+#include <thread>
 
 #include "ggml.h"
 #include "llama.h"
@@ -123,6 +124,9 @@ static std::string get_cpu_info() {
                 (LPBYTE)cpu_brand,
                 &cpu_brand_size) == ERROR_SUCCESS) {
             id.assign(cpu_brand, cpu_brand_size);
+            if (id.find('\0') != std::string::npos) {
+                id.resize(id.find('\0'));
+            }
         }
         RegCloseKey(hKey);
 #endif
@@ -170,13 +174,14 @@ static std::string get_gpu_info() {
 }
 
 // command line params
-enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};
+enum output_formats {NONE, CSV, JSON, JSONL, MARKDOWN, SQL};
 
 static const char * output_format_str(output_formats format) {
     switch (format) {
         case NONE: return "none";
         case CSV: return "csv";
         case JSON: return "json";
+        case JSONL: return "jsonl";
         case MARKDOWN: return "md";
         case SQL: return "sql";
         default: GGML_ABORT("invalid output format");
@@ -190,6 +195,8 @@ static bool output_format_from_str(const std::string & s, output_formats & forma
         format = CSV;
     } else if (s == "json") {
         format = JSON;
+    } else if (s == "jsonl") {
+        format = JSONL;
     } else if (s == "md") {
         format = MARKDOWN;
     } else if (s == "sql") {
@@ -225,6 +232,9 @@ struct cmd_params {
     std::vector<ggml_type> type_k;
     std::vector<ggml_type> type_v;
     std::vector<int> n_threads;
+    std::vector<std::string> cpu_mask;
+    std::vector<bool> cpu_strict;
+    std::vector<int> poll;
     std::vector<int> n_gpu_layers;
     std::vector<std::string> rpc_servers;
     std::vector<llama_split_mode> split_mode;
@@ -236,7 +246,10 @@ struct cmd_params {
     std::vector<bool> embeddings;
     ggml_numa_strategy numa;
     int reps;
+    ggml_sched_priority prio;
+    int delay;
     bool verbose;
+    bool progress;
     output_formats output_format;
     output_formats output_format_stderr;
 };
@@ -251,6 +264,9 @@ static const cmd_params cmd_params_defaults = {
     /* type_k */ {GGML_TYPE_F16},
     /* type_v */ {GGML_TYPE_F16},
     /* n_threads */ {cpu_get_num_math()},
+    /* cpu_mask */ {"0x0"},
+    /* cpu_strict */ {false},
+    /* poll */ {50},
     /* n_gpu_layers */ {99},
     /* rpc_servers */ {""},
     /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
@@ -262,7 +278,10 @@ static const cmd_params cmd_params_defaults = {
     /* embeddings */ {false},
     /* numa */ GGML_NUMA_STRATEGY_DISABLED,
     /* reps */ 5,
+    /* prio */ GGML_SCHED_PRIO_NORMAL,
+    /* delay */ 0,
     /* verbose */ false,
+    /* progress */ false,
     /* output_format */ MARKDOWN,
     /* output_format_stderr */ NONE,
 };
@@ -272,29 +291,37 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("\n");
     printf("options:\n");
     printf(" -h, --help\n");
     printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
     printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
     printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
     printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
     printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
     printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
     printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
     printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
     printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
-    printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
-    printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
-    printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
-    printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
-    printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
-    printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
-    printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
-    printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
-    printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
-    printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
-    printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
-    printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
-    printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
-    printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
+    printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
+    printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
+    printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
+    printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+#ifdef GGML_USE_RPC
+    printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
+#endif
+    printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
+    printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
+    printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
+    printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
+    printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
+    printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
+    printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
+    printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
+    printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
+    printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
+    printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
+    printf(" -o, --output <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
+    printf(" -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
+    printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
+    printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
     printf("\n");
     printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
 }
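A quick illustration of how the enum and the two string helpers in the hunks above round-trip; this is a sketch that assumes it is compiled next to those helpers in llama-bench.cpp and is not part of the patch.

```cpp
#include <cstdio>
#include <string>

// assumes output_formats, output_format_str() and output_format_from_str()
// from the hunks above are in scope
static void check(const std::string & name) {
    output_formats fmt = NONE;
    if (output_format_from_str(name, fmt)) {
        printf("%s -> %d -> %s\n", name.c_str(), (int) fmt, output_format_str(fmt));
    } else {
        printf("%s -> not a valid output format\n", name.c_str());
    }
}

// check("jsonl") prints: jsonl -> 3 -> jsonl
// check("xml")   prints: xml -> not a valid output format
```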
@@ -338,6 +365,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     params.output_format_stderr = cmd_params_defaults.output_format_stderr;
     params.reps = cmd_params_defaults.reps;
     params.numa = cmd_params_defaults.numa;
+    params.prio = cmd_params_defaults.prio;
+    params.delay = cmd_params_defaults.delay;
+    params.progress = cmd_params_defaults.progress;
 
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -409,6 +439,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 }
                 types.push_back(gt);
             }
+            if (invalid_param) {
+                break;
+            }
             params.type_k.insert(params.type_k.end(), types.begin(), types.end());
         } else if (arg == "-ctv" || arg == "--cache-type-v") {
             if (++i >= argc) {
@@ -425,6 +458,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 }
                 types.push_back(gt);
             }
+            if (invalid_param) {
+                break;
+            }
             params.type_v.insert(params.type_v.end(), types.begin(), types.end());
         } else if (arg == "-t" || arg == "--threads") {
             if (++i >= argc) {
@@ -433,6 +469,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = string_split<int>(argv[i], split_delim);
             params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
+        } else if (arg == "-C" || arg == "--cpu-mask") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<std::string>(argv[i], split_delim);
+            params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
+        } else if (arg == "--cpu-strict") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<bool>(argv[i], split_delim);
+            params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
+        } else if (arg == "--poll") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<int>(argv[i], split_delim);
+            params.poll.insert(params.poll.end(), p.begin(), p.end());
         } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -440,12 +497,14 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = string_split<int>(argv[i], split_delim);
             params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
+#ifdef GGML_USE_RPC
         } else if (arg == "-rpc" || arg == "--rpc") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.rpc_servers.push_back(argv[i]);
+#endif
         } else if (arg == "-sm" || arg == "--split-mode") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -467,6 +526,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 }
                 modes.push_back(mode);
             }
+            if (invalid_param) {
+                break;
+            }
             params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
         } else if (arg == "-mg" || arg == "--main-gpu") {
             if (++i >= argc) {
@@ -541,6 +603,18 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 break;
             }
             params.reps = std::stoi(argv[i]);
+        } else if (arg == "--prio") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
+        } else if (arg == "--delay") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.delay = std::stoi(argv[i]);
         } else if (arg == "-o" || arg == "--output") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -555,6 +629,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
         } else if (arg == "-v" || arg == "--verbose") {
             params.verbose = true;
+        } else if (arg == "--progress") {
+            params.progress = true;
         } else {
             invalid_param = true;
             break;
@@ -585,6 +661,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
     if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
     if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
+    if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; }
+    if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; }
+    if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; }
 
     return params;
 }
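The new `-C/--cpu-mask`, `--cpu-strict` and `--poll` arguments reuse the same comma-separated list handling as the existing options. A standalone sketch of that splitting behaviour (mirroring, but not reusing, the project's `string_split<T>` helper) is shown below.

```cpp
// Sketch: how an argument such as "-t 8,16" or "--poll 0,50,100" expands into a
// vector of values. Standalone illustration, not the llama-bench implementation.
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

template <typename T>
static std::vector<T> split_csv(const std::string & arg, char delim = ',') {
    std::vector<T> out;
    std::stringstream ss(arg);
    std::string item;
    while (std::getline(ss, item, delim)) {
        std::stringstream conv(item);
        T value;
        conv >> value;
        out.push_back(value);
    }
    return out;
}

int main() {
    for (int poll : split_csv<int>("0,50,100")) {
        std::cout << poll << "\n"; // each value becomes one benchmark variant
    }
    return 0;
}
```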
@@ -598,6 +677,9 @@ struct cmd_params_instance {
     ggml_type type_k;
     ggml_type type_v;
     int n_threads;
+    std::string cpu_mask;
+    bool cpu_strict;
+    int poll;
     int n_gpu_layers;
     std::string rpc_servers;
     llama_split_mode split_mode;
@@ -667,7 +749,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & tv : params.type_v)
     for (const auto & nkvo : params.no_kv_offload)
     for (const auto & fa : params.flash_attn)
-    for (const auto & nt : params.n_threads) {
+    for (const auto & nt : params.n_threads)
+    for (const auto & cm : params.cpu_mask)
+    for (const auto & cs : params.cpu_strict)
+    for (const auto & pl : params.poll) {
         for (const auto & n_prompt : params.n_prompt) {
             if (n_prompt == 0) {
                 continue;
@@ -681,6 +766,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .type_k = */ tk,
                 /* .type_v = */ tv,
                 /* .n_threads = */ nt,
+                /* .cpu_mask = */ cm,
+                /* .cpu_strict = */ cs,
+                /* .poll = */ pl,
                 /* .n_gpu_layers = */ nl,
                 /* .rpc_servers = */ rpc,
                 /* .split_mode = */ sm,
@@ -707,6 +795,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .type_k = */ tk,
                 /* .type_v = */ tv,
                 /* .n_threads = */ nt,
+                /* .cpu_mask = */ cm,
+                /* .cpu_strict = */ cs,
+                /* .poll = */ pl,
                 /* .n_gpu_layers = */ nl,
                 /* .rpc_servers = */ rpc,
                 /* .split_mode = */ sm,
@@ -733,6 +824,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .type_k = */ tk,
                 /* .type_v = */ tv,
                 /* .n_threads = */ nt,
+                /* .cpu_mask = */ cm,
+                /* .cpu_strict = */ cs,
+                /* .poll = */ pl,
                 /* .n_gpu_layers = */ nl,
                 /* .rpc_servers = */ rpc,
                 /* .split_mode = */ sm,
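The nested loops above cross every value list with every other, so each new per-test parameter (cpu_mask, cpu_strict, poll) adds one more `for` and multiplies the number of generated instances. A standalone illustration of that cross-product expansion, not the actual cmd_params_instance machinery:

```cpp
// Sketch: cross-product of three small value lists, mirroring the loop structure above.
#include <cstdio>
#include <string>
#include <vector>

int main() {
    std::vector<int>         n_threads = {8, 16};
    std::vector<std::string> cpu_mask  = {"0x0", "0xff"};
    std::vector<int>         poll      = {0, 50};

    int count = 0;
    for (int nt : n_threads)
    for (const std::string & cm : cpu_mask)
    for (int pl : poll) {
        printf("instance %d: n_threads=%d cpu_mask=%s poll=%d\n", ++count, nt, cm.c_str(), pl);
    }
    // 2 * 2 * 2 = 8 instances
    return 0;
}
```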
@@ -769,6 +863,9 @@ struct test {
     int n_batch;
     int n_ubatch;
     int n_threads;
+    std::string cpu_mask;
+    bool cpu_strict;
+    int poll;
     bool has_rpc;
     ggml_type type_k;
     ggml_type type_v;
@@ -795,6 +892,9 @@ struct test {
         n_batch = inst.n_batch;
         n_ubatch = inst.n_ubatch;
         n_threads = inst.n_threads;
+        cpu_mask = inst.cpu_mask;
+        cpu_strict = inst.cpu_strict;
+        poll = inst.poll;
         has_rpc = !inst.rpc_servers.empty();
         type_k = inst.type_k;
         type_v = inst.type_v;
@@ -872,13 +972,14 @@ struct test {
             "cpu_info", "gpu_info",
             "model_filename", "model_type", "model_size", "model_n_params",
             "n_batch", "n_ubatch",
-            "n_threads", "type_k", "type_v",
+            "n_threads", "cpu_mask", "cpu_strict", "poll",
+            "type_k", "type_v",
             "n_gpu_layers", "split_mode",
             "main_gpu", "no_kv_offload", "flash_attn",
             "tensor_split", "use_mmap", "embeddings",
             "n_prompt", "n_gen", "test_time",
             "avg_ns", "stddev_ns",
-            "avg_ts", "stddev_ts"
+            "avg_ts", "stddev_ts",
         };
         return fields;
     }
@@ -887,7 +988,7 @@ struct test {
 
     static field_type get_field_type(const std::string & field) {
         if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
-            field == "n_threads" ||
+            field == "n_threads" || field == "poll" ||
             field == "model_size" || field == "model_n_params" ||
             field == "n_gpu_layers" || field == "main_gpu" ||
             field == "n_prompt" || field == "n_gen" ||
@@ -896,6 +997,7 @@ struct test {
         }
         if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
             field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
+            field == "cpu_strict" ||
             field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
             return BOOL;
         }
@@ -928,7 +1030,8 @@ struct test {
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
             std::to_string(n_batch), std::to_string(n_ubatch),
-            std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
+            std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
+            ggml_type_name(type_k), ggml_type_name(type_v),
             std::to_string(n_gpu_layers), split_mode_str(split_mode),
             std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
             tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
@@ -996,38 +1099,39 @@ struct csv_printer : public printer {
     }
 };
 
+static std::string escape_json(const std::string & value) {
+    std::string escaped;
+    for (auto c : value) {
+        if (c == '"') {
+            escaped += "\\\"";
+        } else if (c == '\\') {
+            escaped += "\\\\";
+        } else if (c <= 0x1f) {
+            char buf[8];
+            snprintf(buf, sizeof(buf), "\\u%04x", c);
+            escaped += buf;
+        } else {
+            escaped += c;
+        }
+    }
+    return escaped;
+}
+
+static std::string format_json_value(const std::string & field, const std::string & value) {
+    switch (test::get_field_type(field)) {
+        case test::STRING:
+            return "\"" + escape_json(value) + "\"";
+        case test::BOOL:
+            return value == "0" ? "false" : "true";
+        default:
+            return value;
+    }
+}
+
 struct json_printer : public printer {
     bool first = true;
 
-    static std::string escape_json(const std::string & value) {
-        std::string escaped;
-        for (auto c : value) {
-            if (c == '"') {
-                escaped += "\\\"";
-            } else if (c == '\\') {
-                escaped += "\\\\";
-            } else if (c <= 0x1f) {
-                char buf[8];
-                snprintf(buf, sizeof(buf), "\\u%04x", c);
-                escaped += buf;
-            } else {
-                escaped += c;
-            }
-        }
-        return escaped;
-    }
-
-    static std::string format_value(const std::string & field, const std::string & value) {
-        switch (test::get_field_type(field)) {
-            case test::STRING:
-                return "\"" + escape_json(value) + "\"";
-            case test::BOOL:
-                return value == "0" ? "false" : "true";
-            default:
-                return value;
-        }
-    }
-
     void print_header(const cmd_params & params) override {
         fprintf(fout, "[\n");
         (void) params;
|
|||||||
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
|
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
|
||||||
assert(fields.size() == values.size());
|
assert(fields.size() == values.size());
|
||||||
for (size_t i = 0; i < fields.size(); i++) {
|
for (size_t i = 0; i < fields.size(); i++) {
|
||||||
fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_value(fields.at(i), values.at(i)).c_str());
|
fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1059,6 +1163,25 @@ struct json_printer : public printer {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
struct jsonl_printer : public printer {
|
||||||
|
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
|
||||||
|
assert(fields.size() == values.size());
|
||||||
|
for (size_t i = 0; i < fields.size(); i++) {
|
||||||
|
fprintf(fout, "\"%s\": %s, ", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void print_test(const test & t) override {
|
||||||
|
fprintf(fout, "{");
|
||||||
|
print_fields(test::get_fields(), t.get_values());
|
||||||
|
fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str());
|
||||||
|
fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str());
|
||||||
|
fprintf(fout, "}\n");
|
||||||
|
fflush(fout);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
struct markdown_printer : public printer {
|
struct markdown_printer : public printer {
|
||||||
std::vector<std::string> fields;
|
std::vector<std::string> fields;
|
||||||
|
|
||||||
@ -1067,7 +1190,7 @@ struct markdown_printer : public printer {
|
|||||||
return -30;
|
return -30;
|
||||||
}
|
}
|
||||||
if (field == "t/s") {
|
if (field == "t/s") {
|
||||||
return 16;
|
return 20;
|
||||||
}
|
}
|
||||||
if (field == "size" || field == "params") {
|
if (field == "size" || field == "params") {
|
||||||
return 10;
|
return 10;
|
||||||
@ -1149,6 +1272,15 @@ struct markdown_printer : public printer {
|
|||||||
if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
|
if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
|
||||||
fields.emplace_back("n_threads");
|
fields.emplace_back("n_threads");
|
||||||
}
|
}
|
||||||
|
if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
|
||||||
|
fields.emplace_back("cpu_mask");
|
||||||
|
}
|
||||||
|
if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
|
||||||
|
fields.emplace_back("cpu_strict");
|
||||||
|
}
|
||||||
|
if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
|
||||||
|
fields.emplace_back("poll");
|
||||||
|
}
|
||||||
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
|
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
|
||||||
fields.emplace_back("n_batch");
|
fields.emplace_back("n_batch");
|
||||||
}
|
}
|
||||||
@ -1350,6 +1482,8 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
|
|||||||
return std::unique_ptr<printer>(new csv_printer());
|
return std::unique_ptr<printer>(new csv_printer());
|
||||||
case JSON:
|
case JSON:
|
||||||
return std::unique_ptr<printer>(new json_printer());
|
return std::unique_ptr<printer>(new json_printer());
|
||||||
|
case JSONL:
|
||||||
|
return std::unique_ptr<printer>(new jsonl_printer());
|
||||||
case MARKDOWN:
|
case MARKDOWN:
|
||||||
return std::unique_ptr<printer>(new markdown_printer());
|
return std::unique_ptr<printer>(new markdown_printer());
|
||||||
case SQL:
|
case SQL:
|
||||||
@ -1383,6 +1517,8 @@ int main(int argc, char ** argv) {
|
|||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
|
|
||||||
|
set_process_priority(params.prio);
|
||||||
|
|
||||||
// initialize printer
|
// initialize printer
|
||||||
std::unique_ptr<printer> p = create_printer(params.output_format);
|
std::unique_ptr<printer> p = create_printer(params.output_format);
|
||||||
std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
|
std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
|
||||||
@ -1402,7 +1538,13 @@ int main(int argc, char ** argv) {
|
|||||||
llama_model * lmodel = nullptr;
|
llama_model * lmodel = nullptr;
|
||||||
const cmd_params_instance * prev_inst = nullptr;
|
const cmd_params_instance * prev_inst = nullptr;
|
||||||
|
|
||||||
|
int params_idx = 0;
|
||||||
|
auto params_count = params_instances.size();
|
||||||
for (const auto & inst : params_instances) {
|
for (const auto & inst : params_instances) {
|
||||||
|
params_idx ++;
|
||||||
|
if (params.progress) {
|
||||||
|
fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count);
|
||||||
|
}
|
||||||
// keep the same model between tests when possible
|
// keep the same model between tests when possible
|
||||||
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
|
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
|
||||||
if (lmodel) {
|
if (lmodel) {
|
||||||
@ -1428,12 +1570,40 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
llama_kv_cache_clear(ctx);
|
llama_kv_cache_clear(ctx);
|
||||||
|
|
||||||
|
// cool off before the test
|
||||||
|
if (params.delay) {
|
||||||
|
std::this_thread::sleep_for(std::chrono::seconds(params.delay));
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
|
||||||
|
if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
|
||||||
|
fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
tpp.strict_cpu = t.cpu_strict;
|
||||||
|
tpp.poll = t.poll;
|
||||||
|
tpp.prio = params.prio;
|
||||||
|
|
||||||
|
struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
|
||||||
|
if (!threadpool) {
|
||||||
|
fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_attach_threadpool(ctx, threadpool, NULL);
|
||||||
|
|
||||||
// warmup run
|
// warmup run
|
||||||
if (t.n_prompt > 0) {
|
if (t.n_prompt > 0) {
|
||||||
|
if (params.progress) {
|
||||||
|
fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
|
||||||
|
}
|
||||||
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
|
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
|
||||||
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
|
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
|
||||||
}
|
}
|
||||||
if (t.n_gen > 0) {
|
if (t.n_gen > 0) {
|
||||||
|
if (params.progress) {
|
||||||
|
fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
|
||||||
|
}
|
||||||
test_gen(ctx, 1, 0, t.n_threads);
|
test_gen(ctx, 1, 0, t.n_threads);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1443,9 +1613,15 @@ int main(int argc, char ** argv) {
|
|||||||
uint64_t t_start = get_time_ns();
|
uint64_t t_start = get_time_ns();
|
||||||
|
|
||||||
if (t.n_prompt > 0) {
|
if (t.n_prompt > 0) {
|
||||||
|
if (params.progress) {
|
||||||
|
fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps);
|
||||||
|
}
|
||||||
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
|
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
|
||||||
}
|
}
|
||||||
if (t.n_gen > 0) {
|
if (t.n_gen > 0) {
|
||||||
|
if (params.progress) {
|
||||||
|
fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps);
|
||||||
|
}
|
||||||
test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
|
test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1463,9 +1639,11 @@ int main(int argc, char ** argv) {
|
|||||||
fflush(p_err->fout);
|
fflush(p_err->fout);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_print_timings(ctx);
|
llama_perf_context_print(ctx);
|
||||||
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
|
|
||||||
|
ggml_threadpool_free(threadpool);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_free_model(lmodel);
|
llama_free_model(lmodel);
|
||||||
|
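The threadpool handling added above follows a create/attach/free pattern for each test instance. A condensed sketch of that lifecycle, pulled out of the benchmark loop for readability; it assumes the same headers and helpers as llama-bench.cpp (notably `parse_cpu_mask` from common) and a valid `llama_context *`, and reduces error handling to a bool return.

```cpp
static bool run_with_threadpool(llama_context * ctx, int n_threads,
                                const std::string & cpu_mask, bool strict, int poll,
                                enum ggml_sched_priority prio) {
    struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
    if (!parse_cpu_mask(cpu_mask, tpp.cpumask)) {   // same helper as used above
        return false;
    }
    tpp.strict_cpu = strict;
    tpp.poll       = poll;
    tpp.prio       = prio;

    struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
    if (!threadpool) {
        return false;
    }
    llama_attach_threadpool(ctx, threadpool, NULL);

    // ... run the warmup and timed test_prompt()/test_gen() calls here ...

    ggml_threadpool_free(threadpool);   // released only after the context is done using it
    return true;
}
```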
@@ -120,8 +120,8 @@ Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmo
     LOGi("Using %d threads", n_threads);
 
     llama_context_params ctx_params = llama_context_default_params();
-    ctx_params.seed = 1234;
     ctx_params.n_ctx = 2048;
     ctx_params.n_threads = n_threads;
     ctx_params.n_threads_batch = n_threads;
 
@@ -269,12 +269,6 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
     return env->NewStringUTF(result.str().c_str());
 }
 
-extern "C"
-JNIEXPORT void JNICALL
-Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
-    llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
-}
-
 extern "C"
 JNIEXPORT jlong JNICALL
 Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
@@ -311,6 +305,29 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
     return reinterpret_cast<jlong>(batch);
 }
 
+extern "C"
+JNIEXPORT void JNICALL
+Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
+    llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
+}
+
+extern "C"
+JNIEXPORT jlong JNICALL
+Java_android_llama_cpp_LLamaAndroid_new_1sampler(JNIEnv *, jobject) {
+    auto sparams = llama_sampler_chain_default_params();
+    sparams.no_perf = true;
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+
+    return reinterpret_cast<jlong>(smpl);
+}
+
+extern "C"
+JNIEXPORT void JNICALL
+Java_android_llama_cpp_LLamaAndroid_free_1sampler(JNIEnv *, jobject, jlong sampler_pointer) {
+    llama_sampler_free(reinterpret_cast<llama_sampler *>(sampler_pointer));
+}
+
 extern "C"
 JNIEXPORT void JNICALL
 Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
@@ -381,31 +398,21 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
         jobject,
         jlong context_pointer,
         jlong batch_pointer,
+        jlong sampler_pointer,
         jint n_len,
         jobject intvar_ncur
 ) {
     const auto context = reinterpret_cast<llama_context *>(context_pointer);
     const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
+    const auto sampler = reinterpret_cast<llama_sampler *>(sampler_pointer);
     const auto model = llama_get_model(context);
 
     if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);
     if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
     if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");
 
-    auto n_vocab = llama_n_vocab(model);
-    auto logits = llama_get_logits_ith(context, batch->n_tokens - 1);
-
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
-
-    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-    }
-
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
     // sample the most likely token
-    const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);
+    const auto new_token_id = llama_sampler_sample(sampler, context, -1);
 
     const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
     if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
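The JNI changes above migrate from the old `llama_sample_token_greedy` path to the `llama_sampler` chain API. The same flow written as plain C++ without the JNI plumbing, as a sketch mirroring the calls used in the diff:

```cpp
#include "llama.h"

// Build a one-off greedy sampler chain, sample one token, then release it.
static llama_token sample_greedy(llama_context * ctx) {
    auto sparams = llama_sampler_chain_default_params();
    sparams.no_perf = true;

    llama_sampler * smpl = llama_sampler_chain_init(sparams);
    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

    // -1 samples from the logits of the last token of the previous batch
    const llama_token id = llama_sampler_sample(smpl, ctx, -1);

    llama_sampler_free(smpl);   // frees the chain and every sampler added to it
    return id;
}
```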
@@ -45,8 +45,10 @@ class LLamaAndroid {
     private external fun free_context(context: Long)
     private external fun backend_init(numa: Boolean)
     private external fun backend_free()
-    private external fun free_batch(batch: Long)
     private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long
+    private external fun free_batch(batch: Long)
+    private external fun new_sampler(): Long
+    private external fun free_sampler(sampler: Long)
     private external fun bench_model(
         context: Long,
         model: Long,
@@ -69,6 +71,7 @@ class LLamaAndroid {
     private external fun completion_loop(
         context: Long,
         batch: Long,
+        sampler: Long,
         nLen: Int,
         ncur: IntVar
     ): String?
@@ -101,8 +104,11 @@ class LLamaAndroid {
                     val batch = new_batch(512, 0, 1)
                     if (batch == 0L) throw IllegalStateException("new_batch() failed")
 
+                    val sampler = new_sampler()
+                    if (sampler == 0L) throw IllegalStateException("new_sampler() failed")
+
                     Log.i(tag, "Loaded model $pathToModel")
-                    threadLocalState.set(State.Loaded(model, context, batch))
+                    threadLocalState.set(State.Loaded(model, context, batch, sampler))
                 }
                 else -> throw IllegalStateException("Model already loaded")
             }
@@ -114,7 +120,7 @@ class LLamaAndroid {
             is State.Loaded -> {
                 val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
                 while (ncur.value <= nlen) {
-                    val str = completion_loop(state.context, state.batch, nlen, ncur)
+                    val str = completion_loop(state.context, state.batch, state.sampler, nlen, ncur)
                     if (str == null) {
                         break
                     }
@@ -138,6 +144,7 @@ class LLamaAndroid {
                     free_context(state.context)
                     free_model(state.model)
                     free_batch(state.batch)
+                    free_sampler(state.sampler);
 
                     threadLocalState.set(State.Idle)
                 }
@@ -161,7 +168,7 @@ class LLamaAndroid {
 
     private sealed interface State {
         data object Idle: State
-        data class Loaded(val model: Long, val context: Long, val batch: Long): State
+        data class Loaded(val model: Long, val context: Long, val batch: Long, val sampler: Long): State
     }
 
     // Enforce only one instance of Llm.
@@ -24,6 +24,7 @@ func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama
 actor LlamaContext {
     private var model: OpaquePointer
     private var context: OpaquePointer
+    private var sampling: UnsafeMutablePointer<llama_sampler>
     private var batch: llama_batch
     private var tokens_list: [llama_token]
     var is_done: Bool = false
@@ -42,9 +43,15 @@ actor LlamaContext {
         self.tokens_list = []
         self.batch = llama_batch_init(512, 0, 1)
         self.temporary_invalid_cchars = []
+        let sparams = llama_sampler_chain_default_params()
+        self.sampling = llama_sampler_chain_init(sparams)
+        llama_sampler_chain_add(self.sampling, llama_sampler_init_temp(0.4))
+        llama_sampler_chain_add(self.sampling, llama_sampler_init_softmax())
+        llama_sampler_chain_add(self.sampling, llama_sampler_init_dist(1234))
     }
 
     deinit {
+        llama_sampler_free(sampling)
         llama_batch_free(batch)
         llama_free(context)
         llama_free_model(model)
@@ -69,10 +76,9 @@ actor LlamaContext {
         print("Using \(n_threads) threads")
 
         var ctx_params = llama_context_default_params()
-        ctx_params.seed = 1234
         ctx_params.n_ctx = 2048
-        ctx_params.n_threads = UInt32(n_threads)
-        ctx_params.n_threads_batch = UInt32(n_threads)
+        ctx_params.n_threads = Int32(n_threads)
+        ctx_params.n_threads_batch = Int32(n_threads)
 
         let context = llama_new_context_with_model(model, ctx_params)
         guard let context else {
@@ -144,20 +150,7 @@ actor LlamaContext {
     func completion_loop() -> String {
         var new_token_id: llama_token = 0
 
-        let n_vocab = llama_n_vocab(model)
-        let logits = llama_get_logits_ith(context, batch.n_tokens - 1)
-
-        var candidates = Array<llama_token_data>()
-        candidates.reserveCapacity(Int(n_vocab))
-
-        for token_id in 0..<n_vocab {
-            candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
-        }
-        candidates.withUnsafeMutableBufferPointer() { buffer in
-            var candidates_p = llama_token_data_array(data: buffer.baseAddress, size: buffer.count, sorted: false)
-
-            new_token_id = llama_sample_token_greedy(context, &candidates_p)
-        }
+        new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)
 
         if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
             print("\n")
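For comparison with the greedy chain used in the Android example, the Swift code above builds a temperature → softmax → dist chain. A C++ sketch of the equivalent construction (kept in C++ for consistency with the other examples here); the 0.4 temperature and 1234 seed are the values taken from the Swift init above:

```cpp
#include "llama.h"

// Build the same sampler chain as LlamaContext.init: temperature, softmax, then
// a seeded distribution sampler. The caller releases it with llama_sampler_free().
static llama_sampler * make_swiftui_style_chain() {
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(chain, llama_sampler_init_temp(0.4f));
    llama_sampler_chain_add(chain, llama_sampler_init_softmax());
    llama_sampler_chain_add(chain, llama_sampler_init_dist(1234));
    return chain;
}
```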
@@ -39,7 +39,7 @@ python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
 3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
 
 ```sh
-python ./examples/llava/convert_image_encoder_to_gguf \
+python ./examples/llava/convert_image_encoder_to_gguf.py \
 -m path/to/clip-vit-large-patch14-336 \
 --llava-projector path/to/MobileVLM-1.7B/llava.projector \
 --output-dir path/to/MobileVLM-1.7B \
@@ -47,7 +47,7 @@ python ./examples/llava/convert_image_encoder_to_gguf \
 ```
 
 ```sh
-python ./examples/llava/convert_image_encoder_to_gguf \
+python ./examples/llava/convert_image_encoder_to_gguf.py \
 -m path/to/clip-vit-large-patch14-336 \
 --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
 --output-dir path/to/MobileVLM-1.7B_V2 \
@@ -57,12 +57,12 @@ python ./examples/llava/convert_image_encoder_to_gguf \
 4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
 
 ```sh
-python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B
+python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B --skip-unknown
 ```
 
-5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
+5. Use `quantize` to convert LLaMA part's DataType from `fp32` to `q4_k`
 ```sh
-./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
+./llama-quantize path/to/MobileVLM-1.7B/ggml-model-F32.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
 ```
 
 Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.
@@ -15,8 +15,8 @@ cd llama.cpp
 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)
 
 ```bash
-python ./examples/minicpmv/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
-python ./examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
+python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
+python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
 python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
 
 # quantize int4 version
@@ -3,7 +3,6 @@
 // I'll gradually clean and extend it
 // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
-#include "log.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
@@ -40,6 +39,11 @@
 #include <cinttypes>
 #include <limits>
 
+#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+
 //#define CLIP_DEBUG_FUNCTIONS
 
 // RGB uint8 image
@@ -165,7 +169,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
 static int get_key_idx(const gguf_context * ctx, const char * key) {
     int i = gguf_find_key(ctx, key);
     if (i == -1) {
-        LOG_TEE("key %s not found in file\n", key);
+        LOG_ERR("key %s not found in file\n", key);
         throw std::runtime_error(format("Missing required key: %s", key));
     }
 
@@ -216,13 +220,19 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
 
 static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
     if (search.empty()) {
-        return; // Avoid infinite loop if 'search' is an empty string
+        return;
     }
+    std::string builder;
+    builder.reserve(s.length());
     size_t pos = 0;
-    while ((pos = s.find(search, pos)) != std::string::npos) {
-        s.replace(pos, search.length(), replace);
-        pos += replace.length();
+    size_t last_pos = 0;
+    while ((pos = s.find(search, last_pos)) != std::string::npos) {
+        builder.append(s, last_pos, pos - last_pos);
+        builder.append(replace);
+        last_pos = pos + search.length();
     }
+    builder.append(s, last_pos, std::string::npos);
+    s = std::move(builder);
 }
 
 static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
|
|||||||
|
|
||||||
static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
|
static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
|
||||||
size_t tensor_size = ggml_nbytes(tensor);
|
size_t tensor_size = ggml_nbytes(tensor);
|
||||||
LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
|
LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
|
||||||
prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
|
prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
|
||||||
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
|
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
|
||||||
}
|
}
|
||||||
@ -282,7 +292,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
|
|||||||
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
|
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
|
||||||
std::ofstream file(filename, std::ios::binary);
|
std::ofstream file(filename, std::ios::binary);
|
||||||
if (!file.is_open()) {
|
if (!file.is_open()) {
|
||||||
LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
|
LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -301,7 +311,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
|
|||||||
static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
|
static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
|
||||||
std::ofstream file(filename, std::ios::binary);
|
std::ofstream file(filename, std::ios::binary);
|
||||||
if (!file.is_open()) {
|
if (!file.is_open()) {
|
||||||
LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
|
LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -562,7 +572,7 @@ struct clip_ctx {
|
|||||||
|
|
||||||
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
|
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
LOG_ERR("This gguf file seems to have no vision encoder\n");
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -576,7 +586,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||||||
if (load_image_size == nullptr) {
|
if (load_image_size == nullptr) {
|
||||||
load_image_size = clip_image_size_init();
|
load_image_size = clip_image_size_init();
|
||||||
}
|
}
|
||||||
LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
||||||
image_size_width = load_image_size->width;
|
image_size_width = load_image_size->width;
|
||||||
image_size_height = load_image_size->height;
|
image_size_height = load_image_size->height;
|
||||||
if (is_inf) {
|
if (is_inf) {
|
||||||
@ -1041,21 +1051,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
const int idx_name = gguf_find_key(ctx, KEY_NAME);
|
const int idx_name = gguf_find_key(ctx, KEY_NAME);
|
||||||
if (idx_name != -1) { // make name optional temporarily as some of the uploaded models are missing it due to a bug
|
if (idx_name != -1) { // make name optional temporarily as some of the uploaded models are missing it due to a bug

|
||||||
const std::string name = gguf_get_val_str(ctx, idx_name);
|
const std::string name = gguf_get_val_str(ctx, idx_name);
|
||||||
LOG_TEE("%s: model name: %s\n", __func__, name.c_str());
|
LOG_INF("%s: model name: %s\n", __func__, name.c_str());
|
||||||
}
|
}
|
||||||
LOG_TEE("%s: description: %s\n", __func__, description.c_str());
|
LOG_INF("%s: description: %s\n", __func__, description.c_str());
|
||||||
LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
|
LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
|
||||||
LOG_TEE("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
|
LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
|
||||||
LOG_TEE("%s: n_tensors: %d\n", __func__, n_tensors);
|
LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
|
||||||
LOG_TEE("%s: n_kv: %d\n", __func__, n_kv);
|
LOG_INF("%s: n_kv: %d\n", __func__, n_kv);
|
||||||
LOG_TEE("%s: ftype: %s\n", __func__, ftype_str.c_str());
|
LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str());
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
}
|
}
|
||||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||||
|
|
||||||
// kv
|
// kv
|
||||||
const int n_kv = gguf_get_n_kv(ctx);
|
const int n_kv = gguf_get_n_kv(ctx);
|
||||||
LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
|
LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
|
||||||
__func__, n_kv, n_tensors, fname);
|
__func__, n_kv, n_tensors, fname);
|
||||||
{
|
{
|
||||||
std::map<enum ggml_type, uint32_t> n_type;
|
std::map<enum ggml_type, uint32_t> n_type;
|
||||||
@ -1066,7 +1076,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
n_type[type]++;
|
n_type[type]++;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
|
LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
|
||||||
for (int i = 0; i < n_kv; i++) {
|
for (int i = 0; i < n_kv; i++) {
|
||||||
const char * name = gguf_get_key(ctx, i);
|
const char * name = gguf_get_key(ctx, i);
|
||||||
const enum gguf_type type = gguf_get_kv_type(ctx, i);
|
const enum gguf_type type = gguf_get_kv_type(ctx, i);
|
||||||
@ -1082,7 +1092,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
}
|
}
|
||||||
replace_all(value, "\n", "\\n");
|
replace_all(value, "\n", "\\n");
|
||||||
|
|
||||||
LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
|
LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
// print type counts
|
// print type counts
|
||||||
@ -1091,7 +1101,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
|
LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1106,7 +1116,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
size_t tensor_size = ggml_nbytes(cur);
|
size_t tensor_size = ggml_nbytes(cur);
|
||||||
model_size += tensor_size;
|
model_size += tensor_size;
|
||||||
if (verbosity >= 3) {
|
if (verbosity >= 3) {
|
||||||
LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
|
LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
|
||||||
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
|
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1133,27 +1143,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
|
|
||||||
#ifdef GGML_USE_CUDA
|
#ifdef GGML_USE_CUDA
|
||||||
new_clip->backend = ggml_backend_cuda_init(0);
|
new_clip->backend = ggml_backend_cuda_init(0);
|
||||||
LOG_TEE("%s: CLIP using CUDA backend\n", __func__);
|
LOG_INF("%s: CLIP using CUDA backend\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
new_clip->backend = ggml_backend_metal_init();
|
new_clip->backend = ggml_backend_metal_init();
|
||||||
LOG_TEE("%s: CLIP using Metal backend\n", __func__);
|
LOG_INF("%s: CLIP using Metal backend\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_CANN
|
#ifdef GGML_USE_CANN
|
||||||
new_clip->backend = ggml_backend_cann_init(0);
|
new_clip->backend = ggml_backend_cann_init(0);
|
||||||
LOG_TEE("%s: CLIP using CANN backend\n", __func__);
|
LOG_INF("%s: CLIP using CANN backend\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_VULKAN
|
#ifdef GGML_USE_VULKAN
|
||||||
new_clip->backend = ggml_backend_vk_init(0);
|
new_clip->backend = ggml_backend_vk_init(0);
|
||||||
LOG_TEE("%s: CLIP using Vulkan backend\n", __func__);
|
LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (!new_clip->backend) {
|
if (!new_clip->backend) {
|
||||||
new_clip->backend = ggml_backend_cpu_init();
|
new_clip->backend = ggml_backend_cpu_init();
|
||||||
LOG_TEE("%s: CLIP using CPU backend\n", __func__);
|
LOG_INF("%s: CLIP using CPU backend\n", __func__);
|
||||||
}
|
}
|
||||||
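Aside from the LOG_TEE to LOG_INF renames, nothing changes in the backend selection itself; collapsing the interleaved columns, clip_model_load still takes whichever accelerator backend was compiled in and falls back to the CPU backend otherwise. A condensed view of the pattern shown above (the GGML_USE_CANN and GGML_USE_VULKAN blocks follow the same shape and are elided here):

#ifdef GGML_USE_CUDA
    new_clip->backend = ggml_backend_cuda_init(0);
    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
#endif

#ifdef GGML_USE_METAL
    new_clip->backend = ggml_backend_metal_init();
    LOG_INF("%s: CLIP using Metal backend\n", __func__);
#endif

    // ... CANN and Vulkan blocks are identical in shape ...

    if (!new_clip->backend) {
        new_clip->backend = ggml_backend_cpu_init();
        LOG_INF("%s: CLIP using CPU backend\n", __func__);
    }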
|
|
||||||
// model size and capabilities
|
// model size and capabilities
|
||||||
@ -1188,16 +1198,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
|
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
|
||||||
|
|
||||||
if (verbosity >= 1) {
|
if (verbosity >= 1) {
|
||||||
LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
||||||
LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
||||||
LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
||||||
LOG_TEE("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
|
LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
|
||||||
LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
|
LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
|
||||||
LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
|
LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
|
||||||
|
|
||||||
// load tensors
|
// load tensors
|
||||||
{
|
{
|
||||||
@ -1210,7 +1220,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
|
|
||||||
new_clip->ctx_data = ggml_init(params);
|
new_clip->ctx_data = ggml_init(params);
|
||||||
if (!new_clip->ctx_data) {
|
if (!new_clip->ctx_data) {
|
||||||
LOG_TEE("%s: ggml_init() failed\n", __func__);
|
LOG_ERR("%s: ggml_init() failed\n", __func__);
|
||||||
clip_free(new_clip);
|
clip_free(new_clip);
|
||||||
gguf_free(ctx);
|
gguf_free(ctx);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
@ -1218,7 +1228,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
|
|
||||||
auto fin = std::ifstream(fname, std::ios::binary);
|
auto fin = std::ifstream(fname, std::ios::binary);
|
||||||
if (!fin) {
|
if (!fin) {
|
||||||
LOG_TEE("cannot open model file for loading tensors\n");
|
LOG_ERR("cannot open model file for loading tensors\n");
|
||||||
clip_free(new_clip);
|
clip_free(new_clip);
|
||||||
gguf_free(ctx);
|
gguf_free(ctx);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
@ -1240,7 +1250,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
|
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
|
||||||
fin.seekg(offset, std::ios::beg);
|
fin.seekg(offset, std::ios::beg);
|
||||||
if (!fin) {
|
if (!fin) {
|
||||||
LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name);
|
LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name);
|
||||||
clip_free(new_clip);
|
clip_free(new_clip);
|
||||||
gguf_free(ctx);
|
gguf_free(ctx);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
@ -1311,23 +1321,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (verbosity >= 2) {
|
if (verbosity >= 2) {
|
||||||
LOG_TEE("\n%s: vision model hparams\n", __func__);
|
LOG_INF("\n%s: vision model hparams\n", __func__);
|
||||||
LOG_TEE("image_size %d\n", hparams.image_size);
|
LOG_INF("image_size %d\n", hparams.image_size);
|
||||||
LOG_TEE("patch_size %d\n", hparams.patch_size);
|
LOG_INF("patch_size %d\n", hparams.patch_size);
|
||||||
LOG_TEE("v_hidden_size %d\n", hparams.hidden_size);
|
LOG_INF("v_hidden_size %d\n", hparams.hidden_size);
|
||||||
LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate);
|
LOG_INF("v_n_intermediate %d\n", hparams.n_intermediate);
|
||||||
LOG_TEE("v_projection_dim %d\n", hparams.projection_dim);
|
LOG_INF("v_projection_dim %d\n", hparams.projection_dim);
|
||||||
LOG_TEE("v_n_head %d\n", hparams.n_head);
|
LOG_INF("v_n_head %d\n", hparams.n_head);
|
||||||
LOG_TEE("v_n_layer %d\n", hparams.n_layer);
|
LOG_INF("v_n_layer %d\n", hparams.n_layer);
|
||||||
LOG_TEE("v_eps %f\n", hparams.eps);
|
LOG_INF("v_eps %f\n", hparams.eps);
|
||||||
LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
|
LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
|
||||||
LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
|
LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
|
||||||
LOG_TEE("v_image_grid_pinpoints: ");
|
LOG_INF("v_image_grid_pinpoints: ");
|
||||||
for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
|
for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
|
||||||
LOG_TEE("%d ", hparams.image_grid_pinpoints[i]);
|
LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
|
||||||
}
|
}
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
|
LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1365,7 +1375,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
|
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
|
||||||
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
|
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
|
||||||
} catch(const std::exception& /*e*/) {
|
} catch(const std::exception& /*e*/) {
|
||||||
LOG_TEE("%s: failed to load vision model tensors\n", __func__);
|
LOG_ERR("%s: failed to load vision model tensors\n", __func__);
|
||||||
}
|
}
|
||||||
|
|
||||||
// LLaVA projection
|
// LLaVA projection
|
||||||
@ -1394,7 +1404,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
} catch (std::runtime_error & /*e*/) { }
|
} catch (std::runtime_error & /*e*/) { }
|
||||||
try {
|
try {
|
||||||
vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
|
vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
|
||||||
// LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
|
// LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__);
|
||||||
} catch (std::runtime_error & /*e*/) { }
|
} catch (std::runtime_error & /*e*/) { }
|
||||||
} else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
|
} else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
|
||||||
// MobileVLM projection
|
// MobileVLM projection
|
||||||
@ -1495,7 +1505,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
|
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
|
||||||
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
|
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
|
||||||
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
|
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
|
||||||
LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
return new_clip;
|
return new_clip;
|
||||||
@ -1546,7 +1556,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
|
|||||||
int nx, ny, nc;
|
int nx, ny, nc;
|
||||||
auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
|
auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
|
||||||
if (!data) {
|
if (!data) {
|
||||||
LOG_TEE("%s: failed to load image '%s'\n", __func__, fname);
|
LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
build_clip_img_from_data(data, nx, ny, img);
|
build_clip_img_from_data(data, nx, ny, img);
|
||||||
@ -1558,7 +1568,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
|
|||||||
int nx, ny, nc;
|
int nx, ny, nc;
|
||||||
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
|
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
|
||||||
if (!data) {
|
if (!data) {
|
||||||
LOG_TEE("%s: failed to decode image bytes\n", __func__);
|
LOG_ERR("%s: failed to decode image bytes\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
build_clip_img_from_data(data, nx, ny, img);
|
build_clip_img_from_data(data, nx, ny, img);
|
||||||
@ -1617,7 +1627,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline float clip(float x, float lower, float upper) {
|
inline int clip(int x, int lower, int upper) {
|
||||||
return std::max(lower, std::min(x, upper));
|
return std::max(lower, std::min(x, upper));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1748,7 +1758,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
|
|||||||
int downscaled_height = static_cast<int>(original_height * scale);
|
int downscaled_height = static_cast<int>(original_height * scale);
|
||||||
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
|
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
|
||||||
int wasted_resolution = (width * height) - effective_resolution;
|
int wasted_resolution = (width * height) - effective_resolution;
|
||||||
// LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
|
// LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
|
||||||
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
|
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
|
||||||
max_effective_resolution = effective_resolution;
|
max_effective_resolution = effective_resolution;
|
||||||
min_wasted_resolution = wasted_resolution;
|
min_wasted_resolution = wasted_resolution;
|
||||||
@ -1821,10 +1831,6 @@ static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size
|
|||||||
return refine_size;
|
return refine_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline int clip(int x, int lower, int upper) {
|
|
||||||
return std::max(lower, std::min(x, upper));
|
|
||||||
}
|
|
||||||
|
|
||||||
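Two related changes meet here: the float overload of clip() earlier in this hunk becomes the integer version, and the duplicate int definition that used to sit at this spot is deleted, leaving a single clamp helper for the resize code. For reference only, an equivalent formulation with the standard library (assuming C++17; the file itself keeps the max/min form shown above) would be:

#include <algorithm>

// clamps x into [lower, upper], same result as std::max(lower, std::min(x, upper))
inline int clip(int x, int lower, int upper) {
    return std::clamp(x, lower, upper);
}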
static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
|
static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
|
||||||
std::vector<int> candidate_split_grids_nums;
|
std::vector<int> candidate_split_grids_nums;
|
||||||
for (int i : {multiple - 1, multiple, multiple + 1}) {
|
for (int i : {multiple - 1, multiple, multiple + 1}) {
|
||||||
@ -1870,7 +1876,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
|
|||||||
const int multiple = fmin(ceil(ratio), max_slice_nums);
|
const int multiple = fmin(ceil(ratio), max_slice_nums);
|
||||||
|
|
||||||
std::vector<std::vector<clip_image_u8 *>> images;
|
std::vector<std::vector<clip_image_u8 *>> images;
|
||||||
LOG_TEE("%s: multiple %d\n", __func__, multiple);
|
LOG_INF("%s: multiple %d\n", __func__, multiple);
|
||||||
images.push_back(std::vector<clip_image_u8 *>());
|
images.push_back(std::vector<clip_image_u8 *>());
|
||||||
|
|
||||||
if (multiple <= 1) {
|
if (multiple <= 1) {
|
||||||
@ -1885,17 +1891,17 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
|
|||||||
clip_image_u8 * source_image = clip_image_u8_init();
|
clip_image_u8 * source_image = clip_image_u8_init();
|
||||||
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
|
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
|
||||||
// source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
|
// source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
|
||||||
LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
|
LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
|
||||||
images[images.size()-1].push_back(source_image);
|
images[images.size()-1].push_back(source_image);
|
||||||
|
|
||||||
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
|
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
|
||||||
LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
|
LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
|
||||||
|
|
||||||
auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
|
auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
|
||||||
clip_image_u8 * refine_image = clip_image_u8_init();
|
clip_image_u8 * refine_image = clip_image_u8_init();
|
||||||
bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
|
bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
|
||||||
|
|
||||||
LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
|
LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
|
||||||
|
|
||||||
// split_to_patches
|
// split_to_patches
|
||||||
int width = refine_image->nx;
|
int width = refine_image->nx;
|
||||||
@ -1952,7 +1958,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
|||||||
int idx = 0;
|
int idx = 0;
|
||||||
for (size_t i = 0; i < imgs.size(); ++i) {
|
for (size_t i = 0; i < imgs.size(); ++i) {
|
||||||
for (size_t j = 0; j < imgs[i].size(); ++j) {
|
for (size_t j = 0; j < imgs[i].size(); ++j) {
|
||||||
LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
|
LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
|
||||||
clip_image_f32 * res = clip_image_f32_init();
|
clip_image_f32 * res = clip_image_f32_init();
|
||||||
normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
|
normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
|
||||||
res_imgs->data[idx++] = *res;
|
res_imgs->data[idx++] = *res;
|
||||||
@ -1964,7 +1970,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
|||||||
|
|
||||||
bool pad_to_square = true;
|
bool pad_to_square = true;
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
LOG_ERR("This gguf file seems to have no vision encoder\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
auto & params = ctx->vision_model.hparams;
|
auto & params = ctx->vision_model.hparams;
|
||||||
@ -2041,7 +2047,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < patches.size(); i++) {
|
for (size_t i = 0; i < patches.size(); i++) {
|
||||||
// LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
|
// LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
|
||||||
clip_image_u8_free(patches[i]);
|
clip_image_u8_free(patches[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2277,7 +2283,7 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
|
|||||||
|
|
||||||
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
|
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
LOG_ERR("This gguf file seems to have no vision encoder\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2289,7 +2295,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
|
|||||||
|
|
||||||
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
|
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
LOG_ERR("This gguf file seems to have no vision encoder\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2447,7 +2453,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||||||
ggml_backend_graph_compute(ctx->backend, gf);
|
ggml_backend_graph_compute(ctx->backend, gf);
|
||||||
|
|
||||||
// the last node is the embedding tensor
|
// the last node is the embedding tensor
|
||||||
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
|
struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
|
||||||
|
|
||||||
// copy the embeddings to the location passed by the user
|
// copy the embeddings to the location passed by the user
|
||||||
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
|
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
|
||||||
@ -2519,7 +2525,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|||||||
new_type = type;
|
new_type = type;
|
||||||
if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
|
if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
|
||||||
new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
|
new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
|
||||||
// LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
|
// LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
|
||||||
}
|
}
|
||||||
const size_t n_elms = ggml_nelements(cur);
|
const size_t n_elms = ggml_nelements(cur);
|
||||||
float * f32_data;
|
float * f32_data;
|
||||||
@ -2538,7 +2544,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|||||||
f32_data = (float *)conv_buf.data();
|
f32_data = (float *)conv_buf.data();
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
LOG_TEE("Please use an input file in f32 or f16\n");
|
LOG_ERR("Please use an input file in f32 or f16\n");
|
||||||
gguf_free(ctx_out);
|
gguf_free(ctx_out);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -2565,7 +2571,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|||||||
fout.put(0);
|
fout.put(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
|
LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
|
||||||
orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2581,8 +2587,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|||||||
gguf_free(ctx_out);
|
gguf_free(ctx_out);
|
||||||
|
|
||||||
{
|
{
|
||||||
LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
|
LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
|
||||||
LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
|
LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
@ -1,14 +1,16 @@
|
|||||||
#include "ggml.h"
|
#include "arg.h"
|
||||||
|
#include "base64.hpp"
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "sampling.h"
|
||||||
#include "clip.h"
|
#include "clip.h"
|
||||||
#include "llava.h"
|
#include "llava.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
#include "ggml.h"
|
||||||
#include "base64.hpp"
|
|
||||||
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
|
#include <cstring>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
|
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
|
||||||
@ -19,7 +21,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
|
|||||||
n_eval = n_batch;
|
n_eval = n_batch;
|
||||||
}
|
}
|
||||||
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
|
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
|
||||||
LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
*n_past += n_eval;
|
*n_past += n_eval;
|
||||||
@ -40,11 +42,11 @@ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char * sample(struct llama_sampling_context * ctx_sampling,
|
static const char * sample(struct gpt_sampler * smpl,
|
||||||
struct llama_context * ctx_llama,
|
struct llama_context * ctx_llama,
|
||||||
int * n_past) {
|
int * n_past) {
|
||||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
|
const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1);
|
||||||
llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
|
gpt_sampler_accept(smpl, id, true);
|
||||||
static std::string ret;
|
static std::string ret;
|
||||||
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
|
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
|
||||||
ret = "</s>";
|
ret = "</s>";
|
||||||
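The hunk above is where llava-cli moves from the old llama_sampling_* API to the new gpt_sampler one. The remaining pieces of the lifecycle (creation and teardown) appear further down in process_prompt; pulled together, the calls introduced by this change are:

    // created once per generation from the model and sampling params (see process_prompt below)
    struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);

    // sample the next token from the most recent logits (-1 selects the last output) ...
    const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1);
    // ... then record the accepted token; the final flag mirrors the old llama_sampling_accept argument
    gpt_sampler_accept(smpl, id, true);

    // freed when generation finishes
    gpt_sampler_free(smpl);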
@ -74,7 +76,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
|
|||||||
size_t img_base64_str_start, img_base64_str_end;
|
size_t img_base64_str_start, img_base64_str_end;
|
||||||
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
|
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
|
||||||
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
|
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
|
||||||
LOG_TEE("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
|
LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -88,7 +90,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
|
|||||||
|
|
||||||
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
|
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
|
||||||
if (!embed) {
|
if (!embed) {
|
||||||
LOG_TEE("%s: could not load image from base64 string.\n", __func__);
|
LOG_ERR("%s: could not load image from base64 string.\n", __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -112,12 +114,10 @@ struct llava_context {
|
|||||||
struct llama_model * model = NULL;
|
struct llama_model * model = NULL;
|
||||||
};
|
};
|
||||||
|
|
||||||
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
static void print_usage(int, char ** argv) {
|
||||||
gpt_params_print_usage(argc, argv, params);
|
LOG("\n example usage:\n");
|
||||||
|
LOG("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||||
LOG_TEE("\n example usage:\n");
|
LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||||
LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
|
||||||
LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
|
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
|
||||||
@ -127,16 +127,16 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
|
|||||||
auto prompt = params->prompt;
|
auto prompt = params->prompt;
|
||||||
if (prompt_contains_image(prompt)) {
|
if (prompt_contains_image(prompt)) {
|
||||||
if (!params->image.empty()) {
|
if (!params->image.empty()) {
|
||||||
LOG_TEE("using base64 encoded image instead of command line image path\n");
|
LOG_INF("using base64 encoded image instead of command line image path\n");
|
||||||
}
|
}
|
||||||
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
|
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
|
||||||
if (!embed) {
|
if (!embed) {
|
||||||
LOG_TEE("%s: can't load image from prompt\n", __func__);
|
LOG_ERR("%s: can't load image from prompt\n", __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
params->prompt = remove_image_from_prompt(prompt);
|
params->prompt = remove_image_from_prompt(prompt);
|
||||||
} else {
|
} else {
|
||||||
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
|
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
|
||||||
if (!embed) {
|
if (!embed) {
|
||||||
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
|
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
|
||||||
return NULL;
|
return NULL;
|
||||||
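In the hunk above, the thread count handed to the image-embedding helpers now comes from the nested CPU-parameter block (params->cpuparams.n_threads) rather than a top-level field. A minimal sketch of the filename path as it reads after this change, with fname being the image path passed into load_image:

    const int n_threads = params->cpuparams.n_threads;   // previously params->n_threads
    auto * embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, n_threads, fname.c_str());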
@ -157,18 +157,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||||||
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
|
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
|
||||||
system_prompt = prompt.substr(0, image_pos);
|
system_prompt = prompt.substr(0, image_pos);
|
||||||
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
|
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
|
||||||
LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
|
LOG_INF("system_prompt: %s\n", system_prompt.c_str());
|
||||||
if (params->verbose_prompt) {
|
if (params->verbose_prompt) {
|
||||||
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
|
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
|
||||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
|
LOG_INF("user_prompt: %s\n", user_prompt.c_str());
|
||||||
if (params->verbose_prompt) {
|
if (params->verbose_prompt) {
|
||||||
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
|
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
|
||||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -178,7 +178,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||||||
if (params->verbose_prompt) {
|
if (params->verbose_prompt) {
|
||||||
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
|
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
|
||||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -189,21 +189,21 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||||||
|
|
||||||
// generate the response
|
// generate the response
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
|
|
||||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
|
struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
|
||||||
if (!ctx_sampling) {
|
if (!smpl) {
|
||||||
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
|
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string response = "";
|
std::string response = "";
|
||||||
for (int i = 0; i < max_tgt_len; i++) {
|
for (int i = 0; i < max_tgt_len; i++) {
|
||||||
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
|
const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
|
||||||
response += tmp;
|
response += tmp;
|
||||||
if (strcmp(tmp, "</s>") == 0) break;
|
if (strcmp(tmp, "</s>") == 0) break;
|
||||||
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||||
printf("%s", tmp);
|
LOG("%s", tmp);
|
||||||
if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason these do not decode as the correct token (the tokenizer itself works)
|
if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason these do not decode as the correct token (the tokenizer itself works)
|
||||||
if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
|
if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
|
||||||
if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
|
if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
|
||||||
@ -211,8 +211,8 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_sampling_free(ctx_sampling);
|
gpt_sampler_free(smpl);
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct llama_model * llava_init(gpt_params * params) {
|
static struct llama_model * llava_init(gpt_params * params) {
|
||||||
@ -223,7 +223,7 @@ static struct llama_model * llava_init(gpt_params * params) {
|
|||||||
|
|
||||||
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
|
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
LOG_TEE("%s: error: unable to load model\n" , __func__);
|
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
return model;
|
return model;
|
||||||
@ -246,11 +246,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
|
|||||||
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
|
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
|
||||||
|
|
||||||
if (ctx_llama == NULL) {
|
if (ctx_llama == NULL) {
|
||||||
LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
|
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
|
auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
|
||||||
|
|
||||||
ctx_llava->ctx_llama = ctx_llama;
|
ctx_llava->ctx_llama = ctx_llama;
|
||||||
ctx_llava->ctx_clip = ctx_clip;
|
ctx_llava->ctx_clip = ctx_clip;
|
||||||
@ -269,65 +269,54 @@ static void llava_free(struct llava_context * ctx_llava) {
|
|||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
}
|
}
|
||||||
|
|
||||||
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
|
|
||||||
(void) level;
|
|
||||||
(void) user_data;
|
|
||||||
LOG_TEE("%s", text);
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
ggml_time_init();
|
ggml_time_init();
|
||||||
|
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
|
||||||
print_usage(argc, argv, params);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
gpt_init();
|
||||||
log_set_target(log_filename_generator("llava", "log"));
|
|
||||||
LOG_TEE("Log start\n");
|
|
||||||
log_dump_cmdline(argc, argv);
|
|
||||||
llama_log_set(llama_log_callback_logTee, nullptr);
|
|
||||||
#endif // LOG_DISABLE_LOGS
|
|
||||||
|
|
||||||
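Startup in main() is reworked around the common helpers: gpt_params_parse now takes the example id and a usage callback, and the manual logging setup removed above (log_set_target, log_dump_cmdline, the llama_log_callback_logTee shim) collapses into a single gpt_init() call. Condensed from the rows above, the new sequence is:

    gpt_params params;

    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
        return 1; // the usage callback is handed to the parser, so the explicit print_usage call on failure is gone
    }

    gpt_init(); // replaces the old LOG_DISABLE_LOGS block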
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||||
print_usage(argc, argv, {});
|
print_usage(argc, argv);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
auto model = llava_init(&params);
|
|
||||||
|
auto * model = llava_init(&params);
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
|
fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (prompt_contains_image(params.prompt)) {
|
if (prompt_contains_image(params.prompt)) {
|
||||||
auto ctx_llava = llava_init_context(&params, model);
|
auto * ctx_llava = llava_init_context(&params, model);
|
||||||
|
|
||||||
auto image_embed = load_image(ctx_llava, &params, "");
|
auto * image_embed = load_image(ctx_llava, &params, "");
|
||||||
|
|
||||||
// process the prompt
|
// process the prompt
|
||||||
process_prompt(ctx_llava, image_embed, &params, params.prompt);
|
process_prompt(ctx_llava, image_embed, &params, params.prompt);
|
||||||
|
|
||||||
llama_print_timings(ctx_llava->ctx_llama);
|
llama_perf_context_print(ctx_llava->ctx_llama);
|
||||||
llava_image_embed_free(image_embed);
|
llava_image_embed_free(image_embed);
|
||||||
ctx_llava->model = NULL;
|
ctx_llava->model = NULL;
|
||||||
llava_free(ctx_llava);
|
llava_free(ctx_llava);
|
||||||
} else {
|
} else {
|
||||||
for (auto & image : params.image) {
|
for (auto & image : params.image) {
|
||||||
auto ctx_llava = llava_init_context(&params, model);
|
auto * ctx_llava = llava_init_context(&params, model);
|
||||||
|
|
||||||
auto image_embed = load_image(ctx_llava, &params, image);
|
auto * image_embed = load_image(ctx_llava, &params, image);
|
||||||
if (!image_embed) {
|
if (!image_embed) {
|
||||||
std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
|
LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// process the prompt
|
// process the prompt
|
||||||
process_prompt(ctx_llava, image_embed, &params, params.prompt);
|
process_prompt(ctx_llava, image_embed, &params, params.prompt);
|
||||||
|
|
||||||
llama_print_timings(ctx_llava->ctx_llama);
|
llama_perf_context_print(ctx_llava->ctx_llama);
|
||||||
llava_image_embed_free(image_embed);
|
llava_image_embed_free(image_embed);
|
||||||
ctx_llava->model = NULL;
|
ctx_llava->model = NULL;
|
||||||
llava_free(ctx_llava);
|
llava_free(ctx_llava);
|
||||||
|
@ -1,13 +1,23 @@
|
|||||||
#include "clip.h"
|
#include "clip.h"
|
||||||
#include "common.h"
|
|
||||||
#include "llama.h"
|
|
||||||
#include "llava.h"
|
#include "llava.h"
|
||||||
#include "base64.hpp"
|
|
||||||
|
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cerrno>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
|
#include <cstring>
|
||||||
|
#include <limits>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <numeric>
|
|
||||||
|
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
|
||||||
|
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
|
||||||
|
|
||||||
|
#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
|
||||||
|
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
||||||
|
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
||||||
|
#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
|
||||||
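Rather than including the common log.h, llava.cpp now defines thin local wrappers so the LOG_INF/LOG_WRN/LOG_ERR/LOG_DBG call sites below compile on their own: informational and debug output goes to stdout, warnings and errors to stderr. A short usage sketch, mirroring call sites that appear later in this file:

    LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); // stdout
    LOG_ERR("%s: unable to preprocess image\n", __func__);                     // stderr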
|
|
||||||
// RGB uint8 image
|
// RGB uint8 image
|
||||||
struct clip_image_u8 {
|
struct clip_image_u8 {
|
||||||
@ -54,7 +64,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int>& ori
|
|||||||
int downscaled_height = static_cast<int>(original_height * scale);
|
int downscaled_height = static_cast<int>(original_height * scale);
|
||||||
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
|
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
|
||||||
int wasted_resolution = (width * height) - effective_resolution;
|
int wasted_resolution = (width * height) - effective_resolution;
|
||||||
// LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
|
// LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
|
||||||
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
|
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
|
||||||
max_effective_resolution = effective_resolution;
|
max_effective_resolution = effective_resolution;
|
||||||
min_wasted_resolution = wasted_resolution;
|
min_wasted_resolution = wasted_resolution;
|
||||||
@ -184,7 +194,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
|
|||||||
// ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
|
// ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
|
||||||
ggml_build_forward_expand(gf, flatten);
|
ggml_build_forward_expand(gf, flatten);
|
||||||
ggml_graph_compute_with_ctx(model.ctx, gf, 1);
|
ggml_graph_compute_with_ctx(model.ctx, gf, 1);
|
||||||
struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];
|
struct ggml_tensor* result = ggml_graph_node(gf, -1);
|
||||||
|
|
||||||
memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
|
memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
|
||||||
// append without newline tokens (default behavior in llava_arch when not using unpad ):
|
// append without newline tokens (default behavior in llava_arch when not using unpad ):
|
||||||
@ -236,7 +246,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
|||||||
img_res_v.size = 0;
|
img_res_v.size = 0;
|
||||||
img_res_v.data = nullptr;
|
img_res_v.data = nullptr;
|
||||||
if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
|
if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
|
||||||
LOG_TEE("%s: unable to preprocess image\n", __func__);
|
LOG_ERR("%s: unable to preprocess image\n", __func__);
|
||||||
delete[] img_res_v.data;
|
delete[] img_res_v.data;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -265,14 +275,14 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
|||||||
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
|
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
|
||||||
}
|
}
|
||||||
if (!encoded) {
|
if (!encoded) {
|
||||||
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
|
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
const int64_t t_img_enc_steop_batch_us = ggml_time_us();
|
const int64_t t_img_enc_steop_batch_us = ggml_time_us();
|
||||||
LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
|
LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
|
||||||
}
|
}
|
||||||
const int64_t t_img_enc_batch_us = ggml_time_us();
|
const int64_t t_img_enc_batch_us = ggml_time_us();
|
||||||
LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
||||||
|
|
||||||
int n_img_pos_out = 0;
|
int n_img_pos_out = 0;
|
||||||
for (size_t i = 0; i < image_embd_v.size(); i++) {
|
for (size_t i = 0; i < image_embd_v.size(); i++) {
|
||||||
@ -287,7 +297,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
|||||||
load_image_size->width = img->nx;
|
load_image_size->width = img->nx;
|
||||||
load_image_size->height = img->ny;
|
load_image_size->height = img->ny;
|
||||||
clip_add_load_image_size(ctx_clip, load_image_size);
|
clip_add_load_image_size(ctx_clip, load_image_size);
|
||||||
LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
||||||
}
|
}
|
||||||
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
|
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
|
||||||
// flat / default llava-1.5 type embedding
|
// flat / default llava-1.5 type embedding
|
||||||
@ -295,7 +305,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
|||||||
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
|
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
|
||||||
delete[] img_res_v.data;
|
delete[] img_res_v.data;
|
||||||
if (!encoded) {
|
if (!encoded) {
|
||||||
LOG_TEE("Unable to encode image\n");
|
LOG_ERR("Unable to encode image\n");
|
||||||
|
|
||||||
return false;
|
 return false;
 }
@@ -309,12 +319,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
-LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
-LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
@@ -347,12 +357,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
-LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
+LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
-LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
+LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
@@ -362,7 +372,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
-LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
+LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
@@ -375,13 +385,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
-LOG_TEE("Unable to allocate memory for image embeddings\n");
+LOG_ERR("Unable to allocate memory for image embeddings\n");
-LOG_TEE("%s: cannot encode image, aborting\n", __func__);
+LOG_ERR("%s: cannot encode image, aborting\n", __func__);
@@ -401,7 +411,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
-LOG_TEE("%s : failed to eval\n", __func__);
+LOG_ERR("%s : failed to eval\n", __func__);
@@ -413,7 +423,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
-LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
+LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__);
@@ -422,7 +432,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
-LOG_TEE("%s: coulnd't embed the image\n", __func__);
+LOG_ERR("%s: coulnd't embed the image\n", __func__);
@@ -436,7 +446,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
-LOG_TEE("%s: can't read file %s\n", __func__, path);
+LOG_ERR("%s: can't read file %s\n", __func__, path);
@@ -446,7 +456,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
-LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
+LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
@@ -471,7 +481,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx
-LOG_TEE("%s: failed to load %s\n", __func__, image_path);
+LOG_ERR("%s: failed to load %s\n", __func__, image_path);
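The hunks above all make the same mechanical substitution: the catch-all LOG_TEE macro is replaced by the severity-specific macros from common's log.h (LOG_ERR for failures, LOG_WRN for warnings, LOG_INF for status output, LOG_DBG for verbose traces, bare LOG for raw text). A minimal sketch of the resulting call pattern; the helper function and its arguments are invented for illustration only:

#include "log.h"  // common logging macros used across the examples

// Hypothetical helper showing the intended severity split after LOG_TEE is removed.
static bool report_encode_result(bool encoded, int i, int n) {
    if (!encoded) {
        LOG_ERR("Unable to encode image - subimage %d of %d\n", i + 1, n); // hard failure
        return false;
    }
    LOG_INF("%s: subimage %d of %d encoded\n", __func__, i + 1, n);        // normal progress output
    return true;
}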
@@ -1,13 +1,18 @@
-#include "ggml.h"
+#include "arg.h"
 #include "log.h"
 #include "common.h"
+#include "sampling.h"
 #include "clip.h"
 #include "llava.h"
 #include "llama.h"
+#include "ggml.h"

+#include <algorithm>
 #include <cstdio>
 #include <cstdlib>
+#include <cstring>
 #include <vector>
+#include <iostream> // TODO: remove me
@@ -16,14 +21,8 @@ struct llava_context {
 static void show_additional_info(int /*argc*/, char ** argv) {
-LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+LOG("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
+LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
-}
-
-static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
-(void) level;
-(void) user_data;
-LOG_TEE("%s", text);
 }
@@ -34,7 +33,7 @@ static struct llama_model * llava_init(gpt_params * params) {
-LOG_TEE("%s: error: unable to load model\n" , __func__);
+LOG_ERR("%s: unable to load model\n" , __func__);
@@ -49,7 +48,7 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
-LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
+LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
@@ -58,11 +57,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
-LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
+LOG_ERR("%s: failed to create the llama_context\n" , __func__);
-auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
@@ -87,7 +86,7 @@ static struct clip_ctx * clip_init_context(gpt_params * params) {
-auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
@@ -99,7 +98,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
-LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
@@ -123,7 +122,7 @@ static void process_eval_image_embed(struct llava_context * ctx_llava, const str
-auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
+auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
@@ -141,7 +140,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
-LOG_TEE("%s: image token past: %d\n", __func__, n_past);
+LOG_INF("%s: image token past: %d\n", __func__, n_past);
@@ -160,14 +159,14 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
-LOG_TEE("%s: image token past: %d\n", __func__, n_past);
+LOG_INF("%s: image token past: %d\n", __func__, n_past);
-static const char * sample(struct llama_sampling_context * ctx_sampling,
+static const char * sample(struct gpt_sampler * smpl,
-const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
+const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1);
-llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
+gpt_sampler_accept(smpl, id, true);
@@ -179,42 +178,42 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
-auto ctx_clip = clip_init_context(params);
+auto * ctx_clip = clip_init_context(params);
-auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
+auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
-std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
+LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str());
-LOG_TEE("prompt should be given or interactive mode should be on");
+LOG_ERR("prompt should be given or interactive mode should be on");
-auto model = llava_init(params);
+auto * model = llava_init(params);
-auto ctx_llava = llava_init_context(params, model);
+auto * ctx_llava = llava_init_context(params, model);
-LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
+LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
-LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
+LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
-static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
+static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, const std::string & prompt, int & n_past, bool is_first = false){
@@ -236,15 +235,15 @@ static struct llama_sampling_context * llama_init(struct llava_context * ctx_lla
-LOG_TEE("\n");
+LOG_INF("\n");
-struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
+struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
-return ctx_sampling;
+return smpl;
-static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){
+static const char * llama_loop(struct llava_context * ctx_llava,struct gpt_sampler * smpl, int &n_past){
-const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
+const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
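The sampling rewrite above swaps the old llama_sampling_context for the common gpt_sampler object; the lifecycle visible in these hunks is init, sample, accept, free. A condensed sketch of that flow, assuming a loaded model and context and a made-up token budget max_new; decoding of the accepted token is omitted:

// Sketch of the gpt_sampler lifecycle used by the updated examples (not a full program).
struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);

for (int i = 0; i < max_new; ++i) {                            // max_new: illustrative loop bound
    const llama_token id = gpt_sampler_sample(smpl, ctx, -1);  // -1: sample from the last logits
    gpt_sampler_accept(smpl, id, true);                        // update the sampler's internal state
    if (llama_token_is_eog(llama_get_model(ctx), id)) {
        break;                                                 // end-of-generation token
    }
    LOG("%s", llama_token_to_piece(ctx, id).c_str());
}

gpt_perf_print(ctx, smpl);   // replaces llama_print_timings
gpt_sampler_free(smpl);      // replaces llama_sampling_free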
@@ -253,41 +252,36 @@ int main(int argc, char ** argv) {
-if (!gpt_params_parse(argc, argv, params)) {
-show_additional_info(argc, argv);
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
-#ifndef LOG_DISABLE_LOGS
+gpt_init();
-log_set_target(log_filename_generator("llava", "log"));
-LOG_TEE("Log start\n");
-log_dump_cmdline(argc, argv);
-llama_log_set(llama_log_callback_logTee, nullptr);
-#endif // LOG_DISABLE_LOGS
 if (params.mmproj.empty() || (params.image.empty())) {
-gpt_params_print_usage(argc, argv, params);
 show_additional_info(argc, argv);
-auto ctx_llava = minicpmv_init(&params, image, n_past);
+auto * ctx_llava = minicpmv_init(&params, image, n_past);
-LOG_TEE("<user>%s\n", params.prompt.c_str());
+LOG("<user>%s\n", params.prompt.c_str());
-LOG_TEE("<assistant>");
+LOG("<assistant>");
-auto ctx_sampling = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true);
+auto * smpl = llama_init(ctx_llava, &params, params.prompt, n_past, true);
-std::string response = "";
+std::string response;
-auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
+const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
-if(!have_tmp)continue;
-else break;
+if (!have_tmp) {
+continue;
+}
+break;
@@ -296,18 +290,18 @@ int main(int argc, char ** argv) {
-llama_sampling_free(ctx_sampling);
+gpt_sampler_free(smpl);
-LOG_TEE("<user>");
+LOG("<user>");
-LOG_TEE("<assistant>");
+LOG("<assistant>");
-auto ctx_sampling = llama_init(ctx_llava, &params, prompt, n_past, true);
+auto * smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
-std::string response = "";
+std::string response;
-auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
+const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
@@ -315,11 +309,11 @@ int main(int argc, char ** argv) {
-llama_sampling_free(ctx_sampling);
+gpt_sampler_free(smpl);
-llama_print_timings(ctx_llava->ctx_llama);
+llama_perf_context_print(ctx_llava->ctx_llama);
@@ -1,7 +1,9 @@
+#include "arg.h"
 #include "common.h"
+#include "sampling.h"
+#include "log.h"
 #include "llama.h"
-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
@@ -37,23 +39,18 @@ struct ngram_container {
-if (!gpt_params_parse(argc, argv, params)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
-gpt_params_print_usage(argc, argv, params);
 }
+gpt_init();
 const int W = 15; // lookahead window
 const int N = 5; // n-gram size
 const int G = 15; // max verification n-grams
 const bool dump_kv_cache = params.dump_kv_cache;
-#ifndef LOG_DISABLE_LOGS
-log_set_target(log_filename_generator("lookahead", "log"));
-LOG_TEE("Log start\n");
-log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
@@ -75,14 +72,14 @@ int main(int argc, char ** argv) {
-fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
-fprintf(stderr, "\n\n");
+LOG("\n\n");
-fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+LOG("%s", llama_token_to_piece(ctx, id).c_str());
@@ -118,7 +115,7 @@ int main(int argc, char ** argv) {
-struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
@@ -159,14 +156,14 @@ int main(int argc, char ** argv) {
-id = llama_sampling_sample(ctx_sampling, ctx, NULL, 0);
+id = gpt_sampler_sample(smpl, ctx, 0);
-llama_sampling_accept(ctx_sampling, ctx, id, true);
+gpt_sampler_accept(smpl, id, true);
-printf("%s", token_str.c_str());
+LOG("%s", token_str.c_str());
@@ -256,7 +253,7 @@ int main(int argc, char ** argv) {
-fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__);
+LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__);
@@ -284,19 +281,19 @@ int main(int argc, char ** argv) {
-id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_batch);
+id = gpt_sampler_sample(smpl, ctx, i_batch);
-llama_sampling_accept(ctx_sampling, ctx, id, true);
+gpt_sampler_accept(smpl, id, true);
-printf("%s", token_str.c_str());
+LOG("%s", token_str.c_str());
-printf("\033[0;96m%s\033[0m", token_str.c_str());
+LOG("\033[0;96m%s\033[0m", token_str.c_str());
@@ -330,21 +327,21 @@ int main(int argc, char ** argv) {
-printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
+LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
-printf(" - ngram %2d: ", i);
+LOG(" - ngram %2d: ", i);
-printf("%s", token_str.c_str());
+LOG("%s", token_str.c_str());
-printf("\n");
+LOG("\n");
@@ -361,7 +358,7 @@ int main(int argc, char ** argv) {
-tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
+tokens_j[N - 2][i] = gpt_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
@@ -455,23 +452,25 @@ int main(int argc, char ** argv) {
-LOG_TEE("\n\n");
+LOG("\n\n");
-LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
-LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
+LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
-LOG_TEE("\n");
+LOG_INF("\n");
-LOG_TEE("W = %2d\n", W);
+LOG_INF("W = %2d\n", W);
-LOG_TEE("N = %2d\n", N);
+LOG_INF("N = %2d\n", N);
-LOG_TEE("G = %2d\n", G);
+LOG_INF("G = %2d\n", G);
-LOG_TEE("\n");
+LOG_INF("\n");
-LOG_TEE("n_predict = %d\n", n_predict);
+LOG_INF("n_predict = %d\n", n_predict);
-LOG_TEE("n_accept = %d\n", n_accept);
+LOG_INF("n_accept = %d\n", n_accept);
-llama_print_timings(ctx);
+LOG_INF("\n");
+gpt_perf_print(ctx, smpl);
+gpt_sampler_free(smpl);
 llama_kv_cache_view_free(&kvc_view);
-llama_sampling_free(ctx_sampling);
 llama_batch_free(batch);
@@ -480,7 +479,7 @@ int main(int argc, char ** argv) {
-fprintf(stderr, "\n\n");
+LOG("\n\n");
@@ -1,7 +1,8 @@
-#include "ggml.h"
+#include "arg.h"
-#include "llama.h"
 #include "common.h"
 #include "ngram-cache.h"
+#include "ggml.h"
+#include "llama.h"
@@ -13,8 +14,7 @@ int main(int argc, char ** argv){
-if (!gpt_params_parse(argc, argv, params)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
-gpt_params_print_usage(argc, argv, params);
@@ -40,4 +40,6 @@ int main(int argc, char ** argv){
 llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
+
+return 0;
 }
@@ -1,25 +1,26 @@
-#include "ggml.h"
+#include "arg.h"
 #include "common.h"
-#include "llama.h"
 #include "log.h"
 #include "ngram-cache.h"
+#include "llama.h"
+#include "ggml.h"
-#include <cmath>
 #include <cstdint>
 #include <cstdio>
+#include <cinttypes>
 #include <fstream>
 #include <string>
 #include <vector>
-#include <unordered_map>
-if (!gpt_params_parse(argc, argv, params)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
-gpt_params_print_usage(argc, argv, params);
 }
+gpt_init();
@@ -49,7 +50,7 @@ int main(int argc, char ** argv){
-fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
+LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
@@ -128,7 +129,7 @@ int main(int argc, char ** argv){
-LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
+LOG_INF("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
@@ -136,24 +137,24 @@ int main(int argc, char ** argv){
-LOG_TEE("\n");
+LOG("\n");
-LOG_TEE("\n");
+LOG_INF("\n");
-LOG_TEE("n_draft = %d\n", n_draft);
+LOG_INF("n_draft = %d\n", n_draft);
-LOG_TEE("n_predict = %d\n", n_input - n_input % n_ctx);
+LOG_INF("n_predict = %d\n", n_input - n_input % n_ctx);
-LOG_TEE("n_drafted = %d\n", n_drafted);
+LOG_INF("n_drafted = %d\n", n_drafted);
-LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
+LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
-LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
+LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
 t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
-LOG_TEE("n_accept = %d\n", n_accept);
+LOG_INF("n_accept = %d\n", n_accept);
-LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
+LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
-fprintf(stderr, "\n\n");
+LOG("\n\n");
@@ -1,35 +1,31 @@
+#include "arg.h"
 #include "ggml.h"
-#include "llama.h"
 #include "common.h"
 #include "ngram-cache.h"
+#include "sampling.h"
+#include "log.h"
+#include "llama.h"
-#include <cmath>
 #include <cstdint>
 #include <cstdio>
 #include <fstream>
 #include <string>
 #include <vector>
-#include <unordered_map>
-if (!gpt_params_parse(argc, argv, params)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
-gpt_params_print_usage(argc, argv, params);
 }
+gpt_init();
 const bool dump_kv_cache = params.dump_kv_cache;
-#ifndef LOG_DISABLE_LOGS
-log_set_target(log_filename_generator("lookup", "log"));
-LOG_TEE("Log start\n");
-log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
@@ -59,7 +55,7 @@ int main(int argc, char ** argv){
-fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
+LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
@@ -77,14 +73,14 @@ int main(int argc, char ** argv){
-fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
-fprintf(stderr, "\n\n");
+LOG("\n\n");
-fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+LOG("%s", llama_token_to_piece(ctx, id).c_str());
@@ -106,7 +102,7 @@ int main(int argc, char ** argv){
-struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
@@ -125,19 +121,19 @@ int main(int argc, char ** argv){
-LOG("drafted %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, draft).c_str());
+LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
-llama_token id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_dft);
+llama_token id = gpt_sampler_sample(smpl, ctx, i_dft);
-llama_sampling_accept(ctx_sampling, ctx, id, true);
+gpt_sampler_accept(smpl, id, true);
-printf("%s", token_str.c_str());
+LOG("%s", token_str.c_str());
@@ -148,7 +144,7 @@ int main(int argc, char ** argv){
-LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
+LOG_DBG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
@@ -162,19 +158,19 @@ int main(int argc, char ** argv){
-printf("\033[34m%s\033[0m", token_str.c_str());
+LOG("\033[34m%s\033[0m", token_str.c_str());
-printf("%s", token_str.c_str());
+LOG("%s", token_str.c_str());
-LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
+LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
@@ -225,25 +221,26 @@ int main(int argc, char ** argv){
-LOG_TEE("\n\n");
+LOG("\n\n");
-LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
-LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
+LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
-LOG_TEE("\n");
+LOG_INF("\n");
-LOG_TEE("n_draft = %d\n", n_draft);
+LOG_INF("n_draft = %d\n", n_draft);
-LOG_TEE("n_predict = %d\n", n_predict);
+LOG_INF("n_predict = %d\n", n_predict);
-LOG_TEE("n_drafted = %d\n", n_drafted);
+LOG_INF("n_drafted = %d\n", n_drafted);
-LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
+LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
-LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
+LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
 t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
-LOG_TEE("n_accept = %d\n", n_accept);
+LOG_INF("n_accept = %d\n", n_accept);
-LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
+LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
-LOG_TEE("\ntarget:\n");
+LOG_INF("\ntarget:\n\n");
-llama_print_timings(ctx);
+gpt_perf_print(ctx, smpl);
+gpt_sampler_free(smpl);
-llama_sampling_free(ctx_sampling);
 llama_batch_free(batch_tgt);
@@ -251,7 +248,7 @@ int main(int argc, char ** argv){
-fprintf(stderr, "\n\n");
+LOG("\n\n");
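The lookup example's verification loop keeps its structure across these hunks; only the sampling and logging calls change. A trimmed sketch of that accept/reject flow under the new API (draft, n_past, n_accept, and has_eos as in the example; the debug message text is paraphrased and error handling is omitted):

int i_dft = 0;
while (true) {
    // sample from the target model at the position of the current drafted token
    const llama_token id = gpt_sampler_sample(smpl, ctx, i_dft);
    gpt_sampler_accept(smpl, id, true);

    const std::string token_str = llama_token_to_piece(ctx, id);

    if (llama_token_is_eog(model, id)) {
        has_eos = true;
    }

    if (i_dft < (int) draft.size() && id == draft[i_dft]) {
        LOG_DBG("drafted token %d ('%s') accepted\n", id, token_str.c_str());
        ++n_accept; ++n_past; ++i_dft;
        continue;
    }

    // mismatch, or no drafted tokens left: keep the sampled token and restart the draft from it
    draft.clear();
    draft.push_back(id);
    break;
}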
@@ -161,6 +161,8 @@ A value of -1 will enable infinite text generation, even though we have a finite

 If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled.

+The `--no-context-shift` option allows you to stop the infinite text generation once the finite context window is full.
+
 It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode, text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `--predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.

 ### Temperature
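The same startup pattern recurs across the examples above and in the main example whose diff follows: gpt_params_parse() now takes a per-example enum and an optional usage callback, and gpt_init() replaces the per-example LOG_DISABLE_LOGS boilerplate. A minimal sketch assembled from those hunks; the usage text is illustrative only, not the actual help output:

#include "arg.h"
#include "common.h"
#include "log.h"

static void print_usage(int /*argc*/, char ** argv) {
    LOG("\nexample usage:\n\n  %s -m model.gguf -p \"...\"\n", argv[0]);  // illustrative text
}

int main(int argc, char ** argv) {
    gpt_params params;

    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
        return 1;  // parser already printed the usage callback on failure
    }

    gpt_init();  // common logging/runtime setup

    // ... example-specific work follows
    return 0;
}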
@ -1,11 +1,11 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#include "console.h"
|
#include "console.h"
|
||||||
|
#include "log.h"
|
||||||
|
#include "sampling.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <cinttypes>
|
|
||||||
#include <cmath>
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
@ -33,6 +33,7 @@
|
|||||||
|
|
||||||
static llama_context ** g_ctx;
|
static llama_context ** g_ctx;
|
||||||
static llama_model ** g_model;
|
static llama_model ** g_model;
|
||||||
|
static gpt_sampler ** g_smpl;
|
||||||
static gpt_params * g_params;
|
static gpt_params * g_params;
|
||||||
static std::vector<llama_token> * g_input_tokens;
|
static std::vector<llama_token> * g_input_tokens;
|
||||||
static std::ostringstream * g_output_ss;
|
static std::ostringstream * g_output_ss;
|
||||||
@ -40,6 +41,15 @@ static std::vector<llama_token> * g_output_tokens;
|
|||||||
static bool is_interacting = false;
|
static bool is_interacting = false;
|
||||||
static bool need_insert_eot = false;
|
static bool need_insert_eot = false;
|
||||||
|
|
||||||
|
static void print_usage(int argc, char ** argv) {
|
||||||
|
(void) argc;
|
||||||
|
|
||||||
|
LOG("\nexample usage:\n");
|
||||||
|
LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
|
||||||
|
LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
|
||||||
|
LOG("\n");
|
||||||
|
}
|
||||||
|
|
||||||
static bool file_exists(const std::string & path) {
|
static bool file_exists(const std::string & path) {
|
||||||
std::ifstream f(path.c_str());
|
std::ifstream f(path.c_str());
|
||||||
return f.good();
|
return f.good();
|
||||||
@ -65,8 +75,7 @@ static void write_logfile(
|
|||||||
|
|
||||||
const bool success = fs_create_directory_with_parents(params.logdir);
|
const bool success = fs_create_directory_with_parents(params.logdir);
|
||||||
if (!success) {
|
if (!success) {
|
||||||
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
LOG_ERR("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str());
|
||||||
__func__, params.logdir.c_str());
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -74,7 +83,7 @@ static void write_logfile(
|
|||||||
FILE * logfile = fopen(logfile_path.c_str(), "w");
|
FILE * logfile = fopen(logfile_path.c_str(), "w");
|
||||||
|
|
||||||
if (logfile == NULL) {
|
if (logfile == NULL) {
|
||||||
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -92,7 +101,7 @@ static void write_logfile(
|
|||||||
yaml_dump_string_multiline(logfile, "output", output.c_str());
|
yaml_dump_string_multiline(logfile, "output", output.c_str());
|
||||||
yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
|
yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
|
||||||
|
|
||||||
llama_dump_timing_info_yaml(logfile, ctx);
|
llama_perf_dump_yaml(logfile, ctx);
|
||||||
fclose(logfile);
|
fclose(logfile);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -104,8 +113,8 @@ static void sigint_handler(int signo) {
|
|||||||
need_insert_eot = true;
|
need_insert_eot = true;
|
||||||
} else {
|
} else {
|
||||||
console::cleanup();
|
console::cleanup();
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
llama_print_timings(*g_ctx);
|
gpt_perf_print(*g_ctx, *g_smpl);
|
||||||
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
|
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
|
||||||
_exit(130);
|
_exit(130);
|
||||||
}
|
}
|
||||||
@ -113,41 +122,24 @@ static void sigint_handler(int signo) {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
|
static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
|
||||||
(void) level;
|
|
||||||
(void) user_data;
|
|
||||||
LOG_TEE("%s", text);
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
|
|
||||||
llama_chat_msg new_msg{role, content};
|
llama_chat_msg new_msg{role, content};
|
||||||
auto formatted = llama_chat_format_single(
|
auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
|
||||||
model, g_params->chat_template, chat_msgs, new_msg, role == "user");
|
|
||||||
chat_msgs.push_back({role, content});
|
chat_msgs.push_back({role, content});
|
||||||
LOG("formatted: %s\n", formatted.c_str());
|
LOG_DBG("formatted: '%s'\n", formatted.c_str());
|
||||||
return formatted;
|
return formatted;
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
g_params = ¶ms;
|
g_params = ¶ms;
|
||||||
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
|
||||||
gpt_params_print_usage(argc, argv, params);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_sampling_params & sparams = params.sparams;
|
gpt_init();
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
auto & sparams = params.sparams;
|
||||||
log_set_target(log_filename_generator("main", "log"));
|
|
||||||
LOG_TEE("Log start\n");
|
|
||||||
log_dump_cmdline(argc, argv);
|
|
||||||
llama_log_set(llama_log_callback_logTee, nullptr);
|
|
||||||
#endif // LOG_DISABLE_LOGS
|
|
||||||
|
|
||||||
// TODO: Dump params ?
|
|
||||||
//LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
|
|
||||||
|
|
||||||
// save choice to use color for later
|
// save choice to use color for later
|
||||||
// (note for later: this is a slightly awkward choice)
|
// (note for later: this is a slightly awkward choice)
|
||||||
@ -155,115 +147,132 @@ int main(int argc, char ** argv) {
|
|||||||
atexit([]() { console::cleanup(); });
|
atexit([]() { console::cleanup(); });
|
||||||
|
|
||||||
if (params.logits_all) {
|
if (params.logits_all) {
|
||||||
printf("\n************\n");
|
LOG_ERR("************\n");
|
||||||
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
|
LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
|
||||||
printf("************\n\n");
|
LOG_ERR("************\n\n");
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.embedding) {
|
if (params.embedding) {
|
||||||
printf("\n************\n");
|
LOG_ERR("************\n");
|
||||||
printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
|
LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
|
||||||
printf("************\n\n");
|
LOG_ERR("************\n\n");
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.n_ctx != 0 && params.n_ctx < 8) {
|
if (params.n_ctx != 0 && params.n_ctx < 8) {
|
||||||
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
||||||
params.n_ctx = 8;
|
params.n_ctx = 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.rope_freq_base != 0.0) {
|
if (params.rope_freq_base != 0.0) {
|
||||||
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
|
LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.rope_freq_scale != 0.0) {
|
if (params.rope_freq_scale != 0.0) {
|
||||||
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
|
LOG_INF("%s: llama backend init\n", __func__);
|
||||||
LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
|
|
||||||
|
|
||||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
|
||||||
params.seed = time(NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG_TEE("%s: seed = %u\n", __func__, params.seed);
|
|
||||||
|
|
||||||
std::mt19937 rng(params.seed);
|
|
||||||
|
|
||||||
LOG("%s: llama backend init\n", __func__);
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
|
|
||||||
llama_model * model;
|
llama_model * model = nullptr;
|
||||||
llama_context * ctx;
|
llama_context * ctx = nullptr;
|
||||||
llama_context * ctx_guidance = NULL;
|
gpt_sampler * smpl = nullptr;
|
||||||
|
|
||||||
std::vector<llama_chat_msg> chat_msgs;
|
std::vector<llama_chat_msg> chat_msgs;
|
||||||
|
|
||||||
g_model = &model;
|
g_model = &model;
|
||||||
g_ctx = &ctx;
|
g_ctx = &ctx;
|
||||||
|
g_smpl = &smpl;
|
||||||
|
|
||||||
// load the model and apply lora adapter, if any
|
// load the model and apply lora adapter, if any
|
||||||
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||||
llama_init_result llama_init = llama_init_from_gpt_params(params);
|
llama_init_result llama_init = llama_init_from_gpt_params(params);
|
||||||
|
|
||||||
model = llama_init.model;
|
model = llama_init.model;
|
||||||
ctx = llama_init.context;
|
ctx = llama_init.context;
|
||||||
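
The scattered rows above add up to a new startup sequence for this example. The following is a minimal, illustrative sketch of that sequence, not part of the patch; it assumes the common.h helpers used in this diff (gpt_params_parse, gpt_init, llama_init_from_gpt_params) and stubs out the print_usage callback.

    // Illustrative sketch only: the post-change startup path of a llama.cpp example.
    #include "arg.h"
    #include "common.h"
    #include "log.h"
    #include "llama.h"

    static void print_usage(int, char ** argv) {
        // stub for the usage callback passed to gpt_params_parse
        (void) argv;
    }

    int main(int argc, char ** argv) {
        gpt_params params;
        if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
            return 1; // the parser reports usage via the callback
        }
        gpt_init(); // common init (logging etc.)

        llama_backend_init();
        llama_numa_init(params.numa);

        // load the model and create the context in one call
        llama_init_result llama_init = llama_init_from_gpt_params(params);
        llama_model   * model = llama_init.model;
        llama_context * ctx   = llama_init.context;
        if (model == NULL) {
            LOG_ERR("%s: error: unable to load model\n", __func__);
            return 1;
        }

        // ... generation loop as in main.cpp ...

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }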
if (sparams.cfg_scale > 1.f) {
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
ctx_guidance = llama_new_context_with_model(model, lparams);
}

if (model == NULL) {
if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n", __func__);
LOG_ERR("%s: error: unable to load model\n", __func__);
return 1;
return 1;
}
}

LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);

struct ggml_threadpool_params tpp_batch =
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
struct ggml_threadpool_params tpp =
ggml_threadpool_params_from_cpu_params(params.cpuparams);

set_process_priority(params.cpuparams.priority);

struct ggml_threadpool * threadpool_batch = NULL;
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
threadpool_batch = ggml_threadpool_new(&tpp_batch);
if (!threadpool_batch) {
LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
return 1;
}

// Start the non-batch threadpool in the paused state
tpp.paused = true;
}

struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
if (!threadpool) {
LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
return 1;
}

llama_attach_threadpool(ctx, threadpool, threadpool_batch);

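
A condensed, illustrative sketch of the threadpool lifecycle added above (it assumes params and ctx as defined in this file and uses only calls that appear in this diff; the batch pool is created only when its parameters differ from the generation pool):

    // create threadpool params from the CPU params, then build and attach the pools
    struct ggml_threadpool_params tpp_batch = ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
    struct ggml_threadpool_params tpp       = ggml_threadpool_params_from_cpu_params(params.cpuparams);

    set_process_priority(params.cpuparams.priority);

    struct ggml_threadpool * threadpool_batch = NULL;
    if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
        threadpool_batch = ggml_threadpool_new(&tpp_batch);
        tpp.paused = true; // start the non-batch pool paused while the batch pool runs
    }
    struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);

    llama_attach_threadpool(ctx, threadpool, threadpool_batch);

    // ... decode ...

    ggml_threadpool_free(threadpool);
    ggml_threadpool_free(threadpool_batch);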
const int n_ctx_train = llama_n_ctx_train(model);
|
const int n_ctx_train = llama_n_ctx_train(model);
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
LOG("n_ctx: %d\n", n_ctx);
|
|
||||||
|
|
||||||
if (n_ctx > n_ctx_train) {
|
if (n_ctx > n_ctx_train) {
|
||||||
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
|
||||||
__func__, n_ctx_train, n_ctx);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// print chat template example in conversation mode
|
// print chat template example in conversation mode
|
||||||
if (params.conversation) {
|
if (params.conversation) {
|
||||||
if (params.enable_chat_template) {
|
if (params.enable_chat_template) {
|
||||||
LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
|
LOG_INF("%s: chat template example:\n%s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
|
||||||
} else {
|
} else {
|
||||||
LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
|
LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
|
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
|
LOG_INF("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string path_session = params.path_prompt_cache;
|
std::string path_session = params.path_prompt_cache;
|
||||||
std::vector<llama_token> session_tokens;
|
std::vector<llama_token> session_tokens;
|
||||||
|
|
||||||
if (!path_session.empty()) {
|
if (!path_session.empty()) {
|
||||||
LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
|
LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
|
||||||
if (!file_exists(path_session)) {
|
if (!file_exists(path_session)) {
|
||||||
LOG_TEE("%s: session file does not exist, will create.\n", __func__);
|
LOG_INF("%s: session file does not exist, will create.\n", __func__);
|
||||||
} else if (file_is_empty(path_session)) {
|
} else if (file_is_empty(path_session)) {
|
||||||
LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__);
|
LOG_INF("%s: The session file is empty. A new session will be initialized.\n", __func__);
|
||||||
} else {
|
} else {
|
||||||
// The file exists and is not empty
|
// The file exists and is not empty
|
||||||
session_tokens.resize(n_ctx);
|
session_tokens.resize(n_ctx);
|
||||||
size_t n_token_count_out = 0;
|
size_t n_token_count_out = 0;
|
||||||
if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
|
if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
|
||||||
LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
|
LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str());
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
session_tokens.resize(n_token_count_out);
|
session_tokens.resize(n_token_count_out);
|
||||||
LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
|
LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -271,7 +280,8 @@ int main(int argc, char ** argv) {
|
|||||||
if (!llama_model_has_encoder(model)) {
|
if (!llama_model_has_encoder(model)) {
|
||||||
GGML_ASSERT(!llama_add_eos_token(model));
|
GGML_ASSERT(!llama_add_eos_token(model));
|
||||||
}
|
}
|
||||||
LOG("add_bos: %d\n", add_bos);
|
|
||||||
|
LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos);
|
||||||
|
|
||||||
std::vector<llama_token> embd_inp;
|
std::vector<llama_token> embd_inp;
|
||||||
|
|
||||||
@ -280,49 +290,31 @@ int main(int argc, char ** argv) {
|
|||||||
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
|
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
|
||||||
: params.prompt;
|
: params.prompt;
|
||||||
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
|
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
|
||||||
LOG("tokenize the prompt\n");
|
LOG_DBG("tokenize the prompt\n");
|
||||||
embd_inp = ::llama_tokenize(ctx, prompt, true, true);
|
embd_inp = ::llama_tokenize(ctx, prompt, true, true);
|
||||||
} else {
|
} else {
|
||||||
LOG("use session tokens\n");
|
LOG_DBG("use session tokens\n");
|
||||||
embd_inp = session_tokens;
|
embd_inp = session_tokens;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("prompt: \"%s\"\n", log_tostr(prompt));
|
LOG_DBG("prompt: \"%s\"\n", prompt.c_str());
|
||||||
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
// Should not run without any tokens
|
// Should not run without any tokens
|
||||||
if (embd_inp.empty()) {
|
if (embd_inp.empty()) {
|
||||||
if (add_bos) {
|
if (add_bos) {
|
||||||
embd_inp.push_back(llama_token_bos(model));
|
embd_inp.push_back(llama_token_bos(model));
|
||||||
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
|
||||||
} else {
|
} else {
|
||||||
LOG_TEE("error: input is empty\n");
|
LOG_ERR("input is empty\n");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Tokenize negative prompt
|
// Tokenize negative prompt
|
||||||
std::vector<llama_token> guidance_inp;
|
|
||||||
int guidance_offset = 0;
|
|
||||||
int original_prompt_len = 0;
|
|
||||||
if (ctx_guidance) {
|
|
||||||
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
|
|
||||||
|
|
||||||
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
|
|
||||||
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
|
|
||||||
|
|
||||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
|
||||||
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
|
|
||||||
|
|
||||||
original_prompt_len = original_inp.size();
|
|
||||||
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
|
|
||||||
LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
|
|
||||||
LOG("guidance_offset: %s", log_tostr(guidance_offset));
|
|
||||||
}
|
|
||||||
|
|
||||||
if ((int) embd_inp.size() > n_ctx - 4) {
|
if ((int) embd_inp.size() > n_ctx - 4) {
|
||||||
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -336,29 +328,28 @@ int main(int argc, char ** argv) {
|
|||||||
n_matching_session_tokens++;
|
n_matching_session_tokens++;
|
||||||
}
|
}
|
||||||
if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
|
if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
|
||||||
LOG_TEE("%s: using full prompt from session file\n", __func__);
|
LOG_INF("%s: using full prompt from session file\n", __func__);
|
||||||
} else if (n_matching_session_tokens >= embd_inp.size()) {
|
} else if (n_matching_session_tokens >= embd_inp.size()) {
|
||||||
LOG_TEE("%s: session file has exact match for prompt!\n", __func__);
|
LOG_INF("%s: session file has exact match for prompt!\n", __func__);
|
||||||
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
|
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
|
||||||
LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
|
LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
|
||||||
__func__, n_matching_session_tokens, embd_inp.size());
|
__func__, n_matching_session_tokens, embd_inp.size());
|
||||||
} else {
|
} else {
|
||||||
LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
|
LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n",
|
||||||
__func__, n_matching_session_tokens, embd_inp.size());
|
__func__, n_matching_session_tokens, embd_inp.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
// remove any "future" tokens that we might have inherited from the previous session
|
// remove any "future" tokens that we might have inherited from the previous session
|
||||||
llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
|
llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
LOGLN(
|
LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
|
||||||
"recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu, embd_inp.size() %zu",
|
embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
|
||||||
log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size());
|
|
||||||
|
|
||||||
// if we will use the cache for the full prompt without reaching the end of the cache, force
|
// if we will use the cache for the full prompt without reaching the end of the cache, force
|
||||||
// reevaluation of the last token to recalculate the cached logits
|
// reevaluation of the last token to recalculate the cached logits
|
||||||
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
|
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
|
||||||
LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1);
|
LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1);
|
||||||
|
|
||||||
session_tokens.resize(embd_inp.size() - 1);
|
session_tokens.resize(embd_inp.size() - 1);
|
||||||
}
|
}
|
||||||
@ -380,30 +371,20 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (params.verbose_prompt) {
|
if (params.verbose_prompt) {
|
||||||
LOG_TEE("\n");
|
LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||||
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||||
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
|
||||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||||
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||||
}
|
|
||||||
|
|
||||||
if (ctx_guidance) {
|
|
||||||
LOG_TEE("\n");
|
|
||||||
LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
|
|
||||||
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
|
||||||
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
|
||||||
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.n_keep > add_bos) {
|
if (params.n_keep > add_bos) {
|
||||||
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
|
LOG_INF("%s: static prompt based on n_keep: '", __func__);
|
||||||
for (int i = 0; i < params.n_keep; i++) {
|
for (int i = 0; i < params.n_keep; i++) {
|
||||||
LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||||
}
|
}
|
||||||
LOG_TEE("'\n");
|
LOG("'\n");
|
||||||
}
|
}
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
// ctrl+C handling
|
// ctrl+C handling
|
||||||
@ -423,47 +404,56 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
LOG_TEE("%s: interactive mode on.\n", __func__);
|
LOG("%s: interactive mode on.\n", __func__);
|
||||||
|
|
||||||
if (!params.antiprompt.empty()) {
|
if (!params.antiprompt.empty()) {
|
||||||
for (const auto & antiprompt : params.antiprompt) {
|
for (const auto & antiprompt : params.antiprompt) {
|
||||||
LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
|
LOG("Reverse prompt: '%s'\n", antiprompt.c_str());
|
||||||
if (params.verbose_prompt) {
|
if (params.verbose_prompt) {
|
||||||
auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
|
auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
|
||||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
|
LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.input_prefix_bos) {
|
if (params.input_prefix_bos) {
|
||||||
LOG_TEE("Input prefix with BOS\n");
|
LOG("Input prefix with BOS\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!params.input_prefix.empty()) {
|
if (!params.input_prefix.empty()) {
|
||||||
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
|
LOG("Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||||
if (params.verbose_prompt) {
|
if (params.verbose_prompt) {
|
||||||
auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
|
auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
|
||||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
|
LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!params.input_suffix.empty()) {
|
if (!params.input_suffix.empty()) {
|
||||||
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
|
LOG("Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||||
if (params.verbose_prompt) {
|
if (params.verbose_prompt) {
|
||||||
auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
|
auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
|
||||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
|
LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
|
|
||||||
LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
|
smpl = gpt_sampler_init(model, sparams);
|
||||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
if (!smpl) {
|
||||||
|
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
|
||||||
|
LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
|
||||||
|
LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
|
||||||
|
|
||||||
|
LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||||
|
|
||||||
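
The rows above and below replace the old llama_sampling_* context with the gpt_sampler API. A minimal, illustrative sketch of the new lifecycle, using only functions that appear in this diff (model, ctx and sparams as defined in this file; prompt_token is a placeholder name):

    struct gpt_sampler * smpl = gpt_sampler_init(model, sparams);
    if (!smpl) {
        LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
        return 1;
    }
    LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));

    // prompt tokens are pushed without applying grammar rules
    gpt_sampler_accept(smpl, prompt_token, /* accept_grammar= */ false);

    // generated tokens are sampled from the context and accepted with the grammar applied
    const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
    gpt_sampler_accept(smpl, id, /* accept_grammar= */ true);

    // on a new interactive turn the sampler state is reset
    gpt_sampler_reset(smpl);

    gpt_perf_print(ctx, smpl);
    gpt_sampler_free(smpl);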
// group-attention state
|
// group-attention state
|
||||||
// number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
|
// number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
|
||||||
@ -477,9 +467,9 @@ int main(int argc, char ** argv) {
|
|||||||
GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
|
GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
|
||||||
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
|
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
|
||||||
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
|
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
|
||||||
LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
|
LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
|
||||||
}
|
}
|
||||||
LOG_TEE("\n\n");
|
LOG("\n");
|
||||||
|
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
const char * control_message;
|
const char * control_message;
|
||||||
@ -491,11 +481,11 @@ int main(int argc, char ** argv) {
|
|||||||
" - To return control without starting a new line, end your input with '/'.\n"
|
" - To return control without starting a new line, end your input with '/'.\n"
|
||||||
" - If you want to submit another line, end your input with '\\'.\n";
|
" - If you want to submit another line, end your input with '\\'.\n";
|
||||||
}
|
}
|
||||||
LOG_TEE("== Running in interactive mode. ==\n");
|
LOG("== Running in interactive mode. ==\n");
|
||||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||||
LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
|
LOG( " - Press Ctrl+C to interject at any time.\n");
|
||||||
#endif
|
#endif
|
||||||
LOG_TEE( "%s\n", control_message);
|
LOG( "%s\n", control_message);
|
||||||
|
|
||||||
is_interacting = params.interactive_first;
|
is_interacting = params.interactive_first;
|
||||||
}
|
}
|
||||||
@ -509,7 +499,6 @@ int main(int argc, char ** argv) {
|
|||||||
int n_remain = params.n_predict;
|
int n_remain = params.n_predict;
|
||||||
int n_consumed = 0;
|
int n_consumed = 0;
|
||||||
int n_session_consumed = 0;
|
int n_session_consumed = 0;
|
||||||
int n_past_guidance = 0;
|
|
||||||
|
|
||||||
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
|
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
|
||||||
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
|
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
|
||||||
@ -521,7 +510,6 @@ int main(int argc, char ** argv) {
|
|||||||
display = params.display_prompt;
|
display = params.display_prompt;
|
||||||
|
|
||||||
std::vector<llama_token> embd;
|
std::vector<llama_token> embd;
|
||||||
std::vector<llama_token> embd_guidance;
|
|
||||||
|
|
||||||
// tokenized antiprompts
|
// tokenized antiprompts
|
||||||
std::vector<std::vector<llama_token>> antiprompt_ids;
|
std::vector<std::vector<llama_token>> antiprompt_ids;
|
||||||
@ -531,18 +519,12 @@ int main(int argc, char ** argv) {
|
|||||||
antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
|
antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
|
||||||
}
|
}
|
||||||
|
|
||||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
if (!ctx_sampling) {
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
exit(1);
}

if (llama_model_has_encoder(model)) {
|
if (llama_model_has_encoder(model)) {
|
||||||
int enc_input_size = embd_inp.size();
|
int enc_input_size = embd_inp.size();
|
||||||
llama_token * enc_input_buf = embd_inp.data();
|
llama_token * enc_input_buf = embd_inp.data();
|
||||||
|
|
||||||
if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
|
if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
|
||||||
LOG_TEE("%s : failed to eval\n", __func__);
|
LOG_ERR("%s : failed to eval\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -568,9 +550,8 @@ int main(int argc, char ** argv) {
|
|||||||
embd.resize(max_embd_size);
|
embd.resize(max_embd_size);
|
||||||
|
|
||||||
console::set_display(console::error);
|
console::set_display(console::error);
|
||||||
printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
||||||
console::set_display(console::reset);
|
console::set_display(console::reset);
|
||||||
fflush(stdout);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ga_n == 1) {
|
if (ga_n == 1) {
|
||||||
@ -578,33 +559,35 @@ int main(int argc, char ** argv) {
|
|||||||
// if we run out of context:
|
// if we run out of context:
|
||||||
// - take the n_keep first tokens from the original prompt (via n_past)
|
// - take the n_keep first tokens from the original prompt (via n_past)
|
||||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
||||||
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) >= n_ctx) {
if (params.n_predict == -2) {
if (n_past + (int) embd.size() >= n_ctx) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
if (!params.ctx_shift){
LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__);
break;
break;
} else {
if (params.n_predict == -2) {
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
}

const int n_left = n_past - params.n_keep;
const int n_discard = n_left/2;

LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);

llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);

n_past -= n_discard;

LOG_DBG("after swap: n_past = %d\n", n_past);

LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());

LOG_DBG("clear session path\n");
path_session.clear();
}
}

const int n_left = n_past - params.n_keep;
const int n_discard = n_left/2;

LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);

llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);

n_past -= n_discard;

if (ctx_guidance) {
n_past_guidance -= n_discard;
}

LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);

LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());

LOG("clear session path\n");
path_session.clear();
}
}
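
A condensed, illustrative sketch of the context-shift step shown above (ctx, n_past and params as defined in this file): half of the tokens past n_keep are dropped and the remainder is shifted back in the KV cache.

    const int n_left    = n_past - params.n_keep;
    const int n_discard = n_left/2;

    // drop the oldest non-kept tokens, then shift the remaining ones back by n_discard
    llama_kv_cache_seq_rm (ctx, 0, params.n_keep            , params.n_keep + n_discard);
    llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);

    n_past -= n_discard;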
} else {
|
} else {
|
||||||
// context extension via Self-Extend
|
// context extension via Self-Extend
|
||||||
@ -613,10 +596,10 @@ int main(int argc, char ** argv) {
|
|||||||
const int bd = (ga_w/ga_n)*(ga_n - 1);
|
const int bd = (ga_w/ga_n)*(ga_n - 1);
|
||||||
const int dd = (ga_w/ga_n) - ib*bd - ga_w;
|
const int dd = (ga_w/ga_n) - ib*bd - ga_w;
|
||||||
|
|
||||||
LOG("\n");
|
LOG_DBG("\n");
|
||||||
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
|
LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
|
||||||
LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
|
LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
|
||||||
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
|
LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
|
||||||
|
|
||||||
llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
|
llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
|
||||||
llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
|
llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
|
||||||
@ -626,7 +609,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
ga_i += ga_w/ga_n;
|
ga_i += ga_w/ga_n;
|
||||||
|
|
||||||
LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
|
LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -652,65 +635,25 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// evaluate tokens in batches
|
|
||||||
// embd is typically prepared beforehand to fit within a batch, but not always
|
|
||||||
if (ctx_guidance) {
|
|
||||||
int input_size = 0;
|
|
||||||
llama_token * input_buf = NULL;
|
|
||||||
|
|
||||||
if (n_past_guidance < (int) guidance_inp.size()) {
|
|
||||||
// Guidance context should have the same data with these modifications:
|
|
||||||
//
|
|
||||||
// * Replace the initial prompt
|
|
||||||
// * Shift everything by guidance_offset
|
|
||||||
embd_guidance = guidance_inp;
|
|
||||||
if (embd.begin() + original_prompt_len < embd.end()) {
|
|
||||||
embd_guidance.insert(
|
|
||||||
embd_guidance.end(),
|
|
||||||
embd.begin() + original_prompt_len,
|
|
||||||
embd.end()
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
input_buf = embd_guidance.data();
|
|
||||||
input_size = embd_guidance.size();
|
|
||||||
|
|
||||||
LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
|
|
||||||
} else {
|
|
||||||
input_buf = embd.data();
|
|
||||||
input_size = embd.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < input_size; i += params.n_batch) {
|
|
||||||
int n_eval = std::min(input_size - i, params.n_batch);
|
|
||||||
if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
|
|
||||||
LOG_TEE("%s : failed to eval\n", __func__);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
n_past_guidance += n_eval;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
|
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
|
||||||
int n_eval = (int) embd.size() - i;
|
int n_eval = (int) embd.size() - i;
|
||||||
if (n_eval > params.n_batch) {
|
if (n_eval > params.n_batch) {
|
||||||
n_eval = params.n_batch;
|
n_eval = params.n_batch;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
|
||||||
|
|
||||||
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
|
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
|
||||||
LOG_TEE("%s : failed to eval\n", __func__);
|
LOG_ERR("%s : failed to eval\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
n_past += n_eval;
|
n_past += n_eval;
|
||||||
|
|
||||||
LOG("n_past = %d\n", n_past);
|
LOG_DBG("n_past = %d\n", n_past);
|
||||||
// Display total tokens alongside total time
|
// Display total tokens alongside total time
|
||||||
if (params.n_print > 0 && n_past % params.n_print == 0) {
|
if (params.n_print > 0 && n_past % params.n_print == 0) {
|
||||||
LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
|
LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -721,7 +664,6 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
embd.clear();
|
embd.clear();
|
||||||
embd_guidance.clear();
|
|
||||||
|
|
||||||
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
||||||
// optionally save the session on first sample (for faster prompt loading next time)
|
// optionally save the session on first sample (for faster prompt loading next time)
|
||||||
@ -729,14 +671,14 @@ int main(int argc, char ** argv) {
|
|||||||
need_to_save_session = false;
|
need_to_save_session = false;
|
||||||
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
||||||
|
|
||||||
LOG("saved session to %s\n", path_session.c_str());
|
LOG_DBG("saved session to %s\n", path_session.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
const llama_token id = gpt_sampler_sample(smpl, ctx, -1);

llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);
gpt_sampler_accept(smpl, id, /* accept_grammar= */ true);

LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
// LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());

embd.push_back(id);
embd.push_back(id);
|
|
||||||
@ -746,16 +688,16 @@ int main(int argc, char ** argv) {
|
|||||||
// decrement remaining sampling budget
|
// decrement remaining sampling budget
|
||||||
--n_remain;
|
--n_remain;
|
||||||
|
|
||||||
LOG("n_remain: %d\n", n_remain);
|
LOG_DBG("n_remain: %d\n", n_remain);
|
||||||
} else {
|
} else {
|
||||||
// some user input remains from prompt or interaction, forward it to processing
|
// some user input remains from prompt or interaction, forward it to processing
|
||||||
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
||||||
while ((int) embd_inp.size() > n_consumed) {
|
while ((int) embd_inp.size() > n_consumed) {
|
||||||
embd.push_back(embd_inp[n_consumed]);
|
embd.push_back(embd_inp[n_consumed]);
|
||||||
|
|
||||||
// push the prompt in the sampling context in order to apply repetition penalties later
|
// push the prompt in the sampling context in order to apply repetition penalties later
|
||||||
// for the prompt, we don't apply grammar rules
|
// for the prompt, we don't apply grammar rules
|
||||||
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false);
|
gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
|
||||||
|
|
||||||
++n_consumed;
|
++n_consumed;
|
||||||
if ((int) embd.size() >= params.n_batch) {
|
if ((int) embd.size() >= params.n_batch) {
|
||||||
@ -770,7 +712,7 @@ int main(int argc, char ** argv) {
|
|||||||
const std::string token_str = llama_token_to_piece(ctx, id, params.special);
|
const std::string token_str = llama_token_to_piece(ctx, id, params.special);
|
||||||
|
|
||||||
// Console/Stream Output
|
// Console/Stream Output
|
||||||
fprintf(stdout, "%s", token_str.c_str());
|
LOG("%s", token_str.c_str());
|
||||||
|
|
||||||
// Record Displayed Tokens To Log
|
// Record Displayed Tokens To Log
|
||||||
// Note: Generated tokens are created one by one hence this check
|
// Note: Generated tokens are created one by one hence this check
|
||||||
@ -782,8 +724,6 @@ int main(int argc, char ** argv) {
|
|||||||
output_tokens.push_back(id);
|
output_tokens.push_back(id);
|
||||||
output_ss << token_str;
|
output_ss << token_str;
|
||||||
}
|
}
|
||||||
|
|
||||||
fflush(stdout);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -798,7 +738,7 @@ int main(int argc, char ** argv) {
|
|||||||
// check for reverse prompt in the last n_prev tokens
|
// check for reverse prompt in the last n_prev tokens
|
||||||
if (!params.antiprompt.empty()) {
|
if (!params.antiprompt.empty()) {
|
||||||
const int n_prev = 32;
|
const int n_prev = 32;
|
||||||
const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev);
|
const std::string last_output = gpt_sampler_prev_str(smpl, ctx, n_prev);
|
||||||
|
|
||||||
is_antiprompt = false;
|
is_antiprompt = false;
|
||||||
// Check if each of the reverse prompts appears at the end of the output.
|
// Check if each of the reverse prompts appears at the end of the output.
|
||||||
@ -820,7 +760,7 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// check for reverse prompt using special tokens
|
// check for reverse prompt using special tokens
|
||||||
llama_token last_token = llama_sampling_last(ctx_sampling);
|
llama_token last_token = gpt_sampler_last(smpl);
|
||||||
for (std::vector<llama_token> ids : antiprompt_ids) {
|
for (std::vector<llama_token> ids : antiprompt_ids) {
|
||||||
if (ids.size() == 1 && last_token == ids[0]) {
|
if (ids.size() == 1 && last_token == ids[0]) {
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
@ -832,13 +772,13 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (is_antiprompt) {
|
if (is_antiprompt) {
|
||||||
LOG("found antiprompt: %s\n", last_output.c_str());
|
LOG_DBG("found antiprompt: %s\n", last_output.c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// deal with end of generation tokens in interactive mode
|
// deal with end of generation tokens in interactive mode
|
||||||
if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
|
if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
|
||||||
LOG("found an EOG token\n");
|
LOG_DBG("found an EOG token\n");
|
||||||
|
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
if (!params.antiprompt.empty()) {
|
if (!params.antiprompt.empty()) {
|
||||||
@ -852,32 +792,32 @@ int main(int argc, char ** argv) {
|
|||||||
chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
|
chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
|
||||||
}
|
}
|
||||||
is_interacting = true;
|
is_interacting = true;
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// if current token is not EOG, we add it to current assistant message
|
// if current token is not EOG, we add it to current assistant message
|
||||||
if (params.conversation) {
|
if (params.conversation) {
|
||||||
auto id = llama_sampling_last(ctx_sampling);
|
const auto id = gpt_sampler_last(smpl);
|
||||||
assistant_ss << llama_token_to_piece(ctx, id, false);
|
assistant_ss << llama_token_to_piece(ctx, id, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (n_past > 0 && is_interacting) {
|
if (n_past > 0 && is_interacting) {
|
||||||
LOG("waiting for user input\n");
|
LOG_DBG("waiting for user input\n");
|
||||||
|
|
||||||
if (params.conversation) {
|
if (params.conversation) {
|
||||||
printf("\n> ");
|
LOG("\n> ");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.input_prefix_bos) {
|
if (params.input_prefix_bos) {
|
||||||
LOG("adding input prefix BOS token\n");
|
LOG_DBG("adding input prefix BOS token\n");
|
||||||
embd_inp.push_back(llama_token_bos(model));
|
embd_inp.push_back(llama_token_bos(model));
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string buffer;
|
std::string buffer;
|
||||||
if (!params.input_prefix.empty() && !params.conversation) {
|
if (!params.input_prefix.empty() && !params.conversation) {
|
||||||
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
||||||
printf("%s", params.input_prefix.c_str());
|
LOG("%s", params.input_prefix.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
// color user input only
|
// color user input only
|
||||||
@ -900,11 +840,11 @@ int main(int argc, char ** argv) {
|
|||||||
if (buffer.length() > 1) {
|
if (buffer.length() > 1) {
|
||||||
// append input suffix if any
|
// append input suffix if any
|
||||||
if (!params.input_suffix.empty() && !params.conversation) {
|
if (!params.input_suffix.empty() && !params.conversation) {
|
||||||
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
||||||
printf("%s", params.input_suffix.c_str());
|
LOG("%s", params.input_suffix.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("buffer: '%s'\n", buffer.c_str());
|
LOG_DBG("buffer: '%s'\n", buffer.c_str());
|
||||||
|
|
||||||
const size_t original_size = embd_inp.size();
|
const size_t original_size = embd_inp.size();
|
||||||
|
|
||||||
@ -921,7 +861,7 @@ int main(int argc, char ** argv) {
|
|||||||
const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
|
const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
|
||||||
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
|
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
|
||||||
|
|
||||||
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
|
LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
|
||||||
|
|
||||||
// if user stop generation mid-way, we must add EOT to finish model's last response
|
// if user stop generation mid-way, we must add EOT to finish model's last response
|
||||||
if (need_insert_eot && format_chat) {
|
if (need_insert_eot && format_chat) {
|
||||||
@ -944,9 +884,9 @@ int main(int argc, char ** argv) {
|
|||||||
assistant_ss.str("");
|
assistant_ss.str("");
|
||||||
|
|
||||||
n_remain -= line_inp.size();
|
n_remain -= line_inp.size();
|
||||||
LOG("n_remain: %d\n", n_remain);
|
LOG_DBG("n_remain: %d\n", n_remain);
|
||||||
} else {
|
} else {
|
||||||
LOG("empty line, passing control back\n");
|
LOG_DBG("empty line, passing control back\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
input_echo = false; // do not echo this again
|
input_echo = false; // do not echo this again
|
||||||
@ -954,7 +894,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
if (n_past > 0) {
|
if (n_past > 0) {
|
||||||
if (is_interacting) {
|
if (is_interacting) {
|
||||||
llama_sampling_reset(ctx_sampling);
|
gpt_sampler_reset(smpl);
|
||||||
}
|
}
|
||||||
is_interacting = false;
|
is_interacting = false;
|
||||||
}
|
}
|
||||||
@ -962,7 +902,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// end of generation
|
// end of generation
|
||||||
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
|
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
|
||||||
LOG_TEE(" [end of text]\n");
|
LOG(" [end of text]\n");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -975,23 +915,23 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
|
if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
|
||||||
LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
|
LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
|
||||||
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_print_timings(ctx);
LOG("\n\n");
gpt_perf_print(ctx, smpl);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);

if (ctx_guidance) { llama_free(ctx_guidance); }
gpt_sampler_free(smpl);

llama_free(ctx);
llama_free(ctx);
llama_free_model(model);
llama_free_model(model);

llama_sampling_free(ctx_sampling);
llama_backend_free();
llama_backend_free();

#ifndef LOG_DISABLE_LOGS
ggml_threadpool_free(threadpool);
LOG_TEE("Log end\n");
ggml_threadpool_free(threadpool_batch);
#endif // LOG_DISABLE_LOGS

return 0;
return 0;
}
}
|
@ -1,7 +1,10 @@
// A basic application simulating a server with multiple clients.
// A basic application simulating a server with multiple clients.
// The clients submit requests to the server and they are processed in parallel.
// The clients submit requests to the server and they are processed in parallel.

#include "arg.h"
#include "common.h"
#include "common.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"
#include "llama.h"

#include <cmath>
#include <cmath>
@ -50,8 +53,8 @@ static std::vector<std::string> k_prompts = {
|
|||||||
|
|
||||||
struct client {
|
struct client {
|
||||||
~client() {
|
~client() {
|
||||||
if (ctx_sampling) {
|
if (smpl) {
|
||||||
llama_sampling_free(ctx_sampling);
|
gpt_sampler_free(smpl);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -72,7 +75,7 @@ struct client {
|
|||||||
std::string prompt;
|
std::string prompt;
|
||||||
std::string response;
|
std::string response;
|
||||||
|
|
||||||
struct llama_sampling_context * ctx_sampling = nullptr;
|
struct gpt_sampler * smpl = nullptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
static void print_date_time() {
|
static void print_date_time() {
|
||||||
@ -81,7 +84,9 @@ static void print_date_time() {
|
|||||||
char buffer[80];
|
char buffer[80];
|
||||||
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time);
|
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time);
|
||||||
|
|
||||||
printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer);
|
LOG_INF("\n");
|
||||||
|
LOG_INF("\033[35mrun parameters as of %s\033[0m\n", buffer);
|
||||||
|
LOG_INF("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Define a split string function to ...
|
// Define a split string function to ...
|
||||||
@ -100,11 +105,12 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
|
||||||
gpt_params_print_usage(argc, argv, params);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
gpt_init();
|
||||||
|
|
||||||
// number of simultaneous "clients" to simulate
|
// number of simultaneous "clients" to simulate
|
||||||
const int32_t n_clients = params.n_parallel;
|
const int32_t n_clients = params.n_parallel;
|
||||||
|
|
||||||
@ -119,12 +125,6 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
const bool dump_kv_cache = params.dump_kv_cache;
|
const bool dump_kv_cache = params.dump_kv_cache;
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
|
||||||
log_set_target(log_filename_generator("parallel", "log"));
|
|
||||||
LOG_TEE("Log start\n");
|
|
||||||
log_dump_cmdline(argc, argv);
|
|
||||||
#endif // LOG_DISABLE_LOGS
|
|
||||||
|
|
||||||
// init llama.cpp
|
// init llama.cpp
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
@ -137,23 +137,22 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// load the prompts from an external file if there are any
|
// load the prompts from an external file if there are any
|
||||||
if (params.prompt.empty()) {
|
if (params.prompt.empty()) {
|
||||||
printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
|
LOG_INF("\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
|
||||||
} else {
|
} else {
|
||||||
// Output each line of the input params.prompts vector and copy to k_prompts
|
// Output each line of the input params.prompts vector and copy to k_prompts
|
||||||
int index = 0;
|
int index = 0;
|
||||||
printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
|
LOG_INF("\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
|
||||||
|
|
||||||
std::vector<std::string> prompts = split_string(params.prompt, '\n');
|
std::vector<std::string> prompts = split_string(params.prompt, '\n');
|
||||||
for (const auto& prompt : prompts) {
|
for (const auto& prompt : prompts) {
|
||||||
k_prompts.resize(index + 1);
|
k_prompts.resize(index + 1);
|
||||||
k_prompts[index] = prompt;
|
k_prompts[index] = prompt;
|
||||||
index++;
|
index++;
|
||||||
printf("%3d prompt: %s\n", index, prompt.c_str());
|
LOG_INF("%3d prompt: %s\n", index, prompt.c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, "\n\n");
|
LOG_INF("\n\n");
|
||||||
fflush(stderr);
|
|
||||||
|
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
@ -161,7 +160,7 @@ int main(int argc, char ** argv) {
|
|||||||
for (size_t i = 0; i < clients.size(); ++i) {
|
for (size_t i = 0; i < clients.size(); ++i) {
|
||||||
auto & client = clients[i];
|
auto & client = clients[i];
|
||||||
client.id = i;
|
client.id = i;
|
||||||
client.ctx_sampling = llama_sampling_init(params.sparams);
|
client.smpl = gpt_sampler_init(model, params.sparams);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<llama_token> tokens_system;
|
std::vector<llama_token> tokens_system;
|
||||||
@ -182,19 +181,19 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
const auto t_main_start = ggml_time_us();
|
const auto t_main_start = ggml_time_us();
|
||||||
|
|
||||||
LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
|
LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
|
||||||
LOG_TEE("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
|
LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
|
|
||||||
{
|
{
|
||||||
LOG_TEE("%s: Evaluating the system prompt ...\n", __func__);
|
LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
|
||||||
|
|
||||||
for (int32_t i = 0; i < n_tokens_system; ++i) {
|
for (int32_t i = 0; i < n_tokens_system; ++i) {
|
||||||
llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
|
llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (llama_decode(ctx, batch) != 0) {
|
if (llama_decode(ctx, batch) != 0) {
|
||||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -203,10 +202,10 @@ int main(int argc, char ** argv) {
|
|||||||
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
|
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("Processing requests ...\n\n");
|
LOG_INF("Processing requests ...\n\n");
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
if (dump_kv_cache) {
|
if (dump_kv_cache) {
|
||||||
@ -237,7 +236,7 @@ int main(int argc, char ** argv) {
|
|||||||
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
|
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("%s: clearing the KV cache\n", __func__);
|
LOG_INF("%s: clearing the KV cache\n", __func__);
|
||||||
}
|
}
|
||||||
|
|
||||||
// insert new sequences for decoding
|
// insert new sequences for decoding
|
||||||
@ -253,7 +252,7 @@ int main(int argc, char ** argv) {
|
|||||||
client.prompt = client.input + "\nAssistant:";
|
client.prompt = client.input + "\nAssistant:";
|
||||||
client.response = "";
|
client.response = "";
|
||||||
|
|
||||||
llama_sampling_reset(client.ctx_sampling);
|
gpt_sampler_reset(client.smpl);
|
||||||
|
|
||||||
// do not prepend BOS because we have a system prompt!
|
// do not prepend BOS because we have a system prompt!
|
||||||
std::vector<llama_token> tokens_prompt;
|
std::vector<llama_token> tokens_prompt;
|
||||||
@ -272,7 +271,7 @@ int main(int argc, char ** argv) {
|
|||||||
client.n_decoded = 0;
|
 client.n_decoded = 0;
 client.i_batch = batch.n_tokens - 1;

-LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
+LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);

 g_seq_id += 1;

@@ -316,11 +315,11 @@ int main(int argc, char ** argv) {
 if (ret != 0) {
 if (n_batch == 1 || ret < 0) {
 // if you get here, it means the KV cache is full - try increasing it via the context size
-LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
+LOG_ERR("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
 return 1;
 }

-LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
+LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);

 n_cache_miss += 1;

@@ -331,7 +330,7 @@ int main(int argc, char ** argv) {
 continue;
 }

-LOG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
+LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens);

 for (auto & client : clients) {
 if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
@@ -341,9 +340,9 @@ int main(int argc, char ** argv) {
 //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
 //        client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);

-const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i);
+const llama_token id = gpt_sampler_sample(client.smpl, ctx, client.i_batch - i);

-llama_sampling_accept(client.ctx_sampling, ctx, id, true);
+gpt_sampler_accept(client.smpl, id, true);

 if (client.n_decoded == 1) {
 // start measuring generation time after the first token to make sure all concurrent clients
@@ -371,12 +370,12 @@ int main(int argc, char ** argv) {
 }

 // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
 llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1);
 llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1);

 const auto t_main_end = ggml_time_us();

-LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput:    %s\n\033[35mResponse: %s\033[0m\n\n",
+LOG_INF("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput:    %s\n\033[35mResponse: %s\033[0m\n\n",
 client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded,
 (t_main_end - client.t_start_prompt) / 1e6,
 (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
@@ -399,21 +398,22 @@ int main(int argc, char ** argv) {

 print_date_time();

-LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
+LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
 if (params.prompt_file.empty()) {
 params.prompt_file = "used built-in defaults";
 }
-LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
+LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
-LOG_TEE("Model and path used:  \033[32m%s\033[0m\n\n", params.model.c_str());
+LOG_INF("Model and path used:  \033[32m%s\033[0m\n\n", params.model.c_str());

-LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
+LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
-LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
+LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
-LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
+LOG_INF("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
-LOG_TEE("Cache misses: %6d\n", n_cache_miss);
+LOG_INF("Cache misses: %6d\n", n_cache_miss);

-LOG_TEE("\n");
+LOG_INF("\n");

-llama_print_timings(ctx);
+// TODO: print sampling/grammar timings for all clients
+llama_perf_context_print(ctx);

 llama_batch_free(batch);

@@ -422,7 +422,7 @@ int main(int argc, char ** argv) {

 llama_backend_free();

-fprintf(stderr, "\n\n");
+LOG("\n\n");

 return 0;
 }
@@ -1,4 +1,6 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"

 #include <cmath>
@@ -6,12 +8,10 @@
 #include <string>
 #include <vector>

-static void print_usage(int argc, char ** argv, const gpt_params & params) {
+static void print_usage(int, char ** argv) {
-gpt_params_print_usage(argc, argv, params);
+LOG("\nexample usage:\n");
+LOG("\n    %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
-LOG_TEE("\nexample usage:\n");
+LOG("\n");
-LOG_TEE("\n    %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
-LOG_TEE("\n");
 }

 int main(int argc, char ** argv) {
@@ -21,12 +21,11 @@ int main(int argc, char ** argv) {
 params.n_keep = 32;
 params.i_pos = -1;

-if (!gpt_params_parse(argc, argv, params)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
-print_usage(argc, argv, params);
 return 1;
 }

-srand(params.seed == LLAMA_DEFAULT_SEED ? time(NULL) : params.seed);
+gpt_init();

 int n_junk = params.n_junk;
 int n_keep = params.n_keep;
@@ -67,7 +66,7 @@ int main(int argc, char ** argv) {
 llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

 if (model == NULL) {
-fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+LOG_ERR("%s: unable to load model\n" , __func__);
 return 1;
 }

@@ -80,12 +79,17 @@ int main(int argc, char ** argv) {
 GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");

 llama_context * ctx = llama_new_context_with_model(model, ctx_params);

 if (ctx == NULL) {
-fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+LOG_ERR("%s: failed to create the llama_context\n" , __func__);
 return 1;
 }

+auto sparams = llama_sampler_chain_default_params();

+llama_sampler * smpl = llama_sampler_chain_init(sparams);

+llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

 // tokenize the prompt
 std::vector<llama_token> tokens_list;
 tokens_list = ::llama_tokenize(ctx, params.prompt, true);
@@ -106,14 +110,14 @@ int main(int argc, char ** argv) {
 const int n_batch = ctx_params.n_batch;
 const int n_batch_grp = ctx_params.n_batch/n_grp;

-LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
+LOG_INF("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);

 // print the prompt token-by-token

-LOG_TEE("\n");
+LOG_INF("\n");
-LOG_TEE("prefix tokens: %d\n", n_tokens_prefix);
+LOG_INF("prefix tokens: %d\n", n_tokens_prefix);
-LOG_TEE("prompt tokens: %d\n", n_tokens_all);
+LOG_INF("prompt tokens: %d\n", n_tokens_all);
-//LOG_TEE("prompt: %s\n", params.prompt.c_str());
+//LOG_INF("prompt: %s\n", params.prompt.c_str());

 llama_batch batch = llama_batch_init(params.n_batch, 0, 1);

@@ -144,11 +148,11 @@ int main(int argc, char ** argv) {
 }

 if (llama_decode(ctx, batch) != 0) {
-LOG_TEE("%s: llama_decode() failed\n", __func__);
+LOG_INF("%s: llama_decode() failed\n", __func__);
 return 1;
 }

-LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
+LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));

 if (i + n_batch >= n_tokens_all) {
 break;
@@ -158,7 +162,7 @@ int main(int argc, char ** argv) {
 for (int i = n_ctx; i < n_tokens_all; i += n_batch) {
 const int n_discard = n_batch;

-LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard);
+LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);

 llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
 llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
@@ -178,18 +182,18 @@ int main(int argc, char ** argv) {
 }

 if (llama_decode(ctx, batch) != 0) {
-LOG_TEE("%s: llama_decode() failed\n", __func__);
+LOG_ERR("%s: llama_decode() failed\n", __func__);
 return 1;
 }

-LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
+LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
 }

 {
 const int n_discard = n_past - n_ctx + n_predict;

 if (n_discard > 0) {
-LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
+LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);

 llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
 llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
@@ -200,47 +204,32 @@ int main(int argc, char ** argv) {
 }
 }

-LOG_TEE("\n");
+LOG_INF("\n");
-LOG_TEE("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
+LOG_INF("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
-LOG_TEE("\n");
+LOG_INF("\n");

 // main loop

 int n_cur = n_tokens_all;
 int n_decode = 0;

-LOG_TEE("%s", prompt_suffix.c_str());
+LOG_INF("%s", prompt_suffix.c_str());
-fflush(stdout);

 const auto t_main_start = ggml_time_us();

 while (n_cur <= n_len) {
 // sample the next token
 {
-auto n_vocab = llama_n_vocab(model);
+const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
-auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);

-std::vector<llama_token_data> candidates;
-candidates.reserve(n_vocab);

-for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-}

-llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

-// sample the most likely token
-const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

 // is it an end of generation?
 if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
-LOG_TEE("\n");
+LOG("\n");

 break;
 }

-LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
-fflush(stdout);

 n_decode += 1;

@@ -255,21 +244,24 @@ int main(int argc, char ** argv) {

 // evaluate the current batch with the transformer model
 if (llama_decode(ctx, batch)) {
-fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
 return 1;
 }
 }

-LOG_TEE("\n");
+LOG("\n");

 const auto t_main_end = ggml_time_us();

-LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
 __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

-llama_print_timings(ctx);
+LOG("\n");
+llama_perf_context_print(ctx);

-fprintf(stderr, "\n");
+LOG("\n");

+llama_sampler_free(smpl);

 llama_batch_free(batch);
@@ -1,18 +1,21 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"

+#include <algorithm>
+#include <array>
+#include <atomic>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
+#include <fstream>
+#include <mutex>
+#include <random>
 #include <sstream>
 #include <thread>
-#include <mutex>
-#include <atomic>
 #include <vector>
-#include <array>
-#include <fstream>
-#include <sstream>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -40,7 +43,7 @@ static void write_logfile(
 }

 if (params.hellaswag) {
-fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
+LOG_WRN("%s: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
 return;
 }

@@ -48,7 +51,7 @@ static void write_logfile(

 const bool success = fs_create_directory_with_parents(params.logdir);
 if (!success) {
-fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
+LOG_WRN("%s: failed to create logdir %s, cannot write logfile\n",
 __func__, params.logdir.c_str());
 return;
 }
@@ -57,7 +60,7 @@ static void write_logfile(
 FILE * logfile = fopen(logfile_path.c_str(), "w");

 if (logfile == NULL) {
-fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
 return;
 }

@@ -76,7 +79,7 @@ static void write_logfile(
 fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
 yaml_dump_vector_float(logfile, "probs", results.probs);

-llama_dump_timing_info_yaml(logfile, ctx);
+llama_perf_dump_yaml(logfile, ctx);
 fclose(logfile);
 }

@@ -343,16 +346,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
 const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
 GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));

-fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+LOG_INF("%s: tokenizing the input ..\n", __func__);

 std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);

 const int n_ctx = llama_n_ctx(ctx);

 if (int(tokens.size()) < 2*n_ctx) {
-fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
+LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
 n_ctx);
-fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
 return {std::move(tokens), 0., {}, {}};
 }

@@ -363,16 +366,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
 prob_history.resize(tokens.size());

 if (params.ppl_stride <= 0) {
-fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
+LOG_ERR("%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
 return {tokens, -1, logit_history, prob_history};
 }

 const int calc_chunk = n_ctx;

-fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
+LOG_INF("%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);

 if (int(tokens.size()) <= calc_chunk) {
-fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
+LOG_ERR("%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
 tokens.size(), n_ctx, params.ppl_stride);
 return {tokens, -1, logit_history, prob_history};
 }
@@ -386,14 +389,14 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
 int count = 0;
 double nll = 0.0;

-fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
+LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);

 for (int i = 0; i < n_chunk; ++i) {
 const int start = i * params.ppl_stride;
 const int end = start + calc_chunk;

 const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
-//fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
+//LOG_DBG("%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);

 std::vector<float> logits;

@@ -406,10 +409,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
 const int batch_start = start + j * n_batch;
 const int batch_size = std::min(end - batch_start, n_batch);

-//fprintf(stderr, "    Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
+//LOG_DBG("    Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
 // TODO: use llama_batch.logits instead of relying on logits_all == true
 if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-//fprintf(stderr, "%s : failed to eval\n", __func__);
+//LOG_ERR("%s : failed to eval\n", __func__);
 return {tokens, -1, logit_history, prob_history};
 }

@@ -433,16 +436,17 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &

 if (i == 0) {
 const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
 int total_seconds = (int)(t_total * n_chunk);
 if (total_seconds >= 60*60) {
-fprintf(stderr, "%d hours ", total_seconds / (60*60));
+LOG("%d hours ", total_seconds / (60*60));
 total_seconds = total_seconds % (60*60);
 }
-fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+LOG("%.2f minutes\n", total_seconds / 60.0);
 }
+LOG("\n");

-//fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
+//LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
 for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {

 // Calculate probability of next token, given the previous ones.
@@ -459,13 +463,12 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
 }
 // perplexity is e^(average negative log-likelihood)
 if (params.ppl_output_type == 0) {
-printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
 } else {
-printf("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
+LOG("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
 }
-fflush(stdout);
 }
-printf("\n");
+LOG("\n");

 return {tokens, std::exp(nll / count), logit_history, prob_history};
 }
@@ -487,26 +490,26 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
 if (!params.logits_file.empty()) {
 logits_stream.open(params.logits_file.c_str(), std::ios::binary);
 if (!logits_stream.is_open()) {
-fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
+LOG_ERR("%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
 return {};
 }
-fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
+LOG_INF("%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
 logits_stream.write("_logits_", 8);
 logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));
 }

 auto tim1 = std::chrono::high_resolution_clock::now();
-fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+LOG_INF("%s: tokenizing the input ..\n", __func__);

 std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);

 auto tim2 = std::chrono::high_resolution_clock::now();
-fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
+LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

 if (int(tokens.size()) < 2*n_ctx) {
-fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
+LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
 n_ctx);
-fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
 return {std::move(tokens), 0., {}, {}};
 }

@@ -539,7 +542,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
 logits.reserve((size_t)n_ctx * n_vocab);
 }

-fprintf(stderr, "%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
+LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);

 std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);

@@ -612,7 +615,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
 }

 if (llama_decode(ctx, batch)) {
-fprintf(stderr, "%s : failed to eval\n", __func__);
+LOG_INF("%s : failed to eval\n", __func__);
 return {tokens, -1, logit_history, prob_history};
 }

@@ -627,14 +630,15 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
 llama_synchronize(ctx);
 const auto t_end = std::chrono::high_resolution_clock::now();
 const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
 int total_seconds = (int)(t_total*n_chunk/n_seq);
 if (total_seconds >= 60*60) {
-fprintf(stderr, "%d hours ", total_seconds / (60*60));
+LOG("%d hours ", total_seconds / (60*60));
 total_seconds = total_seconds % (60*60);
 }
-fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+LOG("%.2f minutes\n", total_seconds / 60.0);
 }
+LOG("\n");

 for (int seq = 0; seq < n_seq_batch; seq++) {
 const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first);
@@ -655,19 +659,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par

 // perplexity is e^(average negative log-likelihood)
 if (params.ppl_output_type == 0) {
-printf("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
+LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
 } else {
 double av = nll/count;
 double av2 = nll2/count - av*av;
 if (av2 > 0) av2 = sqrt(av2/(count-1));
-printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
+LOG("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
 }
 }
-fflush(stdout);

 logits.clear();
 }
-printf("\n");
+LOG("\n");

 nll2 /= count;
 nll /= count;
@@ -675,9 +678,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
 nll2 -= nll * nll;
 if (nll2 > 0) {
 nll2 = sqrt(nll2/(count-1));
-printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
+LOG_INF("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
 } else {
-printf("Unexpected negative standard deviation of log(prob)\n");
+LOG_ERR("Unexpected negative standard deviation of log(prob)\n");
 }

 llama_batch_free(batch);
@@ -703,7 +706,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<

 const int ret = llama_decode(ctx, batch_view);
 if (ret != 0) {
-LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
 return false;
 }

@@ -789,15 +792,15 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
 }

 if (prompt_lines.size() % 6 != 0) {
-fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__);
+LOG_ERR("%s : number of lines in prompt not a multiple of 6.\n", __func__);
 return;
 }

 size_t hs_task_count = prompt_lines.size()/6;
-fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
+LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);

 const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
-fprintf(stderr, "================================= is_spm = %d\n", is_spm);
+LOG_INF("================================= is_spm = %d\n", is_spm);

 // The tasks should be randomized so the score stabilizes quickly.
 bool randomize_tasks = true;
@@ -824,7 +827,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
 std::vector<llama_token> seq_tokens[4];
 };

-fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
+LOG_INF("%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );

 // Select and read data from prompt lines
 std::vector<hs_data_t> hs_data(hs_task_count);
@@ -870,9 +873,9 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
 }
 }

-fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__);
+LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__);

-printf("\ntask\tacc_norm\n");
+LOG("\ntask\tacc_norm\n");

 double acc = 0.0f;

@@ -940,7 +943,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
 }

 if (i0 == i1) {
-fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
+LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
 return;
 }

@@ -948,7 +951,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {

 // decode all tasks [i0, i1)
 if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
-fprintf(stderr, "%s: llama_decode() failed\n", __func__);
+LOG_ERR("%s: llama_decode() failed\n", __func__);
 return;
 }

@@ -998,7 +1001,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
 }
 }

-//printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
+//LOG("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);

 // If the gold ending got the maximum logprobe add one accuracy point
 if (ending_logprob_max_idx == hs_cur.gold_ending_idx) {
@@ -1006,8 +1009,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
 }

 // Print the accumulated accuracy mean x 100
-printf("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
+LOG("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
-fflush(stdout);
 }

 i0 = i1 - 1;
@@ -1015,7 +1017,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {

 llama_batch_free(batch);

-printf("\n");
+LOG("\n");
 }

 struct winogrande_entry {
@@ -1059,7 +1061,7 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
 }
 }
 if (ipos != 4) {
-printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
+LOG_ERR("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
 continue;
 }
 auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3)
@@ -1073,13 +1075,13 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
 if (sentence[where] == '_') break;
 }
 if (where == int(sentence.size())) {
-printf("%s: no _ in <%s>\n", __func__, sentence.c_str());
+LOG_ERR("%s: no _ in <%s>\n", __func__, sentence.c_str());
 continue;
 }
 std::istringstream stream(answer.c_str());
 int i_answer; stream >> i_answer;
 if (stream.fail() || i_answer < 1 || i_answer > 2) {
-printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
+LOG_ERR("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
 continue;
 }
 result.emplace_back();
@@ -1108,14 +1110,14 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {

 auto data = load_winogrande_from_csv(params.prompt);
 if (data.empty()) {
-fprintf(stderr, "%s: no tasks\n", __func__);
+LOG_ERR("%s: no tasks\n", __func__);
 return;
 }

-fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size());
+LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, data.size());

 if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) {
-fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
+LOG_INF("%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
 std::mt19937 rng(1);
 std::vector<int> aux(data.size());
 for (int i = 0; i < int(data.size()); ++i) {
@@ -1133,7 +1135,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
 data = std::move(selected);
 }

-fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
+LOG_INF("%s : tokenizing selected tasks\n", __func__);

 for (auto & task : data) {
 task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
@@ -1156,7 +1158,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
 task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
 }

-fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
+LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);

 const int n_vocab = llama_n_vocab(llama_get_model(ctx));
 const int n_ctx = llama_n_ctx(ctx);
@@ -1217,7 +1219,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
 }

 if (i0 == i1) {
-fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
+LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
 return;
 }

@@ -1225,7 +1227,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {

 // decode all tasks [i0, i1)
 if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
-fprintf(stderr, "%s: llama_decode() failed\n", __func__);
+LOG_ERR("%s: llama_decode() failed\n", __func__);
 return;
 }

@@ -1285,20 +1287,20 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
 ++n_done;

 // print the accumulated accuracy mean x 100
-printf("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
+LOG("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
-fflush(stdout);
 }

 i0 = i1 - 1;
 }

-printf("\n");
+LOG("\n");

 if (n_done < 100) return;

 const float p = 1.f*n_correct/n_done;
 const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1));
-printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
+LOG_INF("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
 }

 static bool deserialize_string(std::istream & in, std::string & str) {
@@ -1347,7 +1349,7 @@ struct multiple_choice_task {
 static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
 if (task.question.empty() || task.mc1.answers.empty()) {
 if (log_error) {
-printf("%s: found bad task with empty question and/or answers\n", __func__);
+LOG_ERR("%s: found bad task with empty question and/or answers\n", __func__);
 }
 return false;
 }
@@ -1355,7 +1357,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
 for (auto& answer : task.mc1.answers) {
 if (answer.empty()) {
 if (log_error) {
-printf("%s: found empty answer\n", __func__);
+LOG_ERR("%s: found empty answer\n", __func__);
 }
 return false;
 }
@@ -1409,14 +1411,14 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 uint32_t n_task;
 strstream.read((char *)&n_task, sizeof(n_task));
 if (strstream.fail() || n_task == 0) {
-printf("%s: no tasks\n", __func__);
+LOG_ERR("%s: no tasks\n", __func__);
 return;
 }
-printf("%s: there are %u tasks in prompt\n", __func__, n_task);
+LOG_INF("%s: there are %u tasks in prompt\n", __func__, n_task);
 std::vector<uint32_t> task_pos(n_task);
 strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
 if (strstream.fail()) {
-printf("%s: failed to read task positions from prompt\n", __func__);
+LOG_ERR("%s: failed to read task positions from prompt\n", __func__);
 return;
 }

@@ -1424,21 +1426,21 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
 // Use all tasks
 tasks.resize(n_task);
-printf("%s: reading tasks", __func__);
+LOG_INF("%s: reading tasks", __func__);
 int n_dot = std::max((int) n_task/100, 1);
 int i = 0;
 for (auto& task : tasks) {
 ++i;
 if (!task.deserialize(strstream)) {
-printf("%s: failed to read task %d of %u\n", __func__, i, n_task);
+LOG_ERR("%s: failed to read task %d of %u\n", __func__, i, n_task);
 return;
 }
-if (i%n_dot == 0) printf(".");
+if (i%n_dot == 0) LOG(".");
 }
-printf("done\n");
+LOG("done\n");
 }
 else {
-printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
+LOG_INF("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
 std::mt19937 rng(1);
 std::vector<int> aux(n_task);
 for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
@@ -1451,18 +1453,16 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 aux.pop_back();
 strstream.seekg(task_pos[idx], std::ios::beg);
 if (!task.deserialize(strstream)) {
-printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
+LOG_ERR("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
 return;
 }
 }
 n_task = params.multiple_choice_tasks;
 }

-printf("%s: preparing task data", __func__);
+LOG_INF("%s: preparing task data", __func__);
-fflush(stdout);
 if (n_task > 500) {
-printf("...");
+LOG("...");
-fflush(stdout);
 std::atomic<int> counter(0);
 std::atomic<int> n_bad(0);
 auto prepare = [&counter, &n_bad, &tasks, ctx] () {
@@ -1486,11 +1486,10 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 for (auto& w : workers) w = std::thread(prepare);
 prepare();
 for (auto& w : workers) w.join();
-printf("done\n");
+LOG("done\n");
-fflush(stdout);
 int nbad = n_bad;
 if (nbad > 0) {
-printf("%s: found %d malformed tasks\n", __func__, nbad);
+LOG_ERR("%s: found %d malformed tasks\n", __func__, nbad);
 return;
 }
 } else {
@@ -1502,16 +1501,15 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 return;
 }
 if (i_task%n_dot == 0) {
-printf(".");
+LOG(".");
-fflush(stdout);
 }
 }
-printf("done\n");
+LOG("done\n");
 }

-printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
+LOG_INF("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());

-printf("\ntask\tacc_norm\n");
+LOG("\ntask\tacc_norm\n");

 const int n_vocab = llama_n_vocab(llama_get_model(ctx));
 const int n_ctx = llama_n_ctx(ctx);
@@ -1590,7 +1588,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 }

 if (i0 == i1) {
-fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
+LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
 return;
 }

@@ -1598,7 +1596,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params

 // decode all tasks [i0, i1)
 if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
-fprintf(stderr, "%s: llama_decode() failed\n", __func__);
+LOG_ERR("%s: llama_decode() failed\n", __func__);
 return;
 }

@@ -1622,13 +1620,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 // compute the logprobs for each ending of the decoded tasks
 for (size_t i = i0; i < i1; ++i) {
 auto & cur_task = tasks[i];
-//printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
+//LOG("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
 //for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
 //    if (cur_task.mc1.labels[j] == 1) {
-//        printf("%d", j+1);
+//        LOG("%d", j+1);
 //    }
 //}
-//printf("\n    common_prefix: %zu\n", cur_task.common_prefix);
+//LOG("\n    common_prefix: %zu\n", cur_task.common_prefix);

 // get the logits of the last token of the common prefix
 std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float));
@@ -1640,13 +1638,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 size_t count = 1;
 float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
 for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
-//printf("        %zu %g\n", ir, eval_results[ir]);
+//LOG("        %zu %g\n", ir, eval_results[ir]);
 ++count;
 log_prob += eval_results[ir++];
 }
 cur_task.log_probs[s] = log_prob / count;
-//printf("        Final: %g\n", log_prob / count);
+//LOG("        Final: %g\n", log_prob / count);
-//printf("        <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
+//LOG("        <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
 }

 // Find the ending with maximum logprob
@@ -1666,8 +1664,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 ++n_done;

 // Print the accumulated accuracy mean x 100
-printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
+LOG("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
-fflush(stdout);
 }

 i0 = i1 - 1;
@@ -1679,29 +1676,30 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params

 float p = 1.f*n_correct/n_done;
 float sigma = sqrt(p*(1-p)/(n_done-1));
-printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
+LOG("\n");
+LOG_INF("Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
 p = 1.f*n_done/n_tot_answers;
 sigma = sqrt(p*(1-p)/(n_done-1));
-printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
+LOG_INF("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);

-printf("\n");
+LOG_INF("\n");
 }

 static void kl_divergence(llama_context * ctx, const gpt_params & params) {
 if (params.logits_file.empty()) {
-fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
+LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
 return;
 }
 std::ifstream in(params.logits_file.c_str(), std::ios::binary);
 if (!in) {
-fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str());
+LOG_ERR("%s: failed to open %s\n", __func__, params.logits_file.c_str());
 return;
 }
 {
 char check[9]; check[8] = 0;
 in.read(check, 8);
 if (in.fail() || strncmp("_logits_", check, 8) != 0) {
-fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
+LOG_ERR("%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
 return;
 }
 }
@@ -1709,7 +1707,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
 uint32_t n_ctx;
 in.read((char *)&n_ctx, sizeof(n_ctx));
 if (n_ctx > llama_n_ctx(ctx)) {
-fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
+LOG_ERR("%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
 __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
 }

@@ -1717,16 +1715,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
 in.read((char *)&n_vocab, sizeof(n_vocab));
 in.read((char *)&n_chunk, sizeof(n_chunk));
 if (in.fail()) {
-fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
+LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
 return;
 }
 if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
-fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
+LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
 }

 std::vector<llama_token> tokens(n_ctx * n_chunk);
 if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
-fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
+LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
 return;
 }

@@ -1775,7 +1773,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
 const auto t_start = std::chrono::high_resolution_clock::now();

 if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
-fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i);
+LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i);
 return;
 }

@@ -1796,7 +1794,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {

 // TODO: use llama_batch.logits instead of relying on logits_all == true
 if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-fprintf(stderr, "%s : failed to eval\n", __func__);
+LOG_ERR("%s : failed to eval\n", __func__);
 return;
 }

@@ -1813,16 +1811,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {

 if (i == 0) {
 const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
 int total_seconds = (int)(t_total * n_chunk);
 if (total_seconds >= 60*60) {
-fprintf(stderr, "%d hours ", total_seconds / (60*60));
|
LOG("%d hours ", total_seconds / (60*60));
|
||||||
total_seconds = total_seconds % (60*60);
|
total_seconds = total_seconds % (60*60);
|
||||||
}
|
}
|
||||||
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
|
LOG("%.2f minutes\n", total_seconds / 60.0);
|
||||||
|
|
||||||
printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
|
|
||||||
}
|
}
|
||||||
|
LOG("\n");
|
||||||
|
LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
|
||||||
|
|
||||||
const int first = n_ctx/2;
|
const int first = n_ctx/2;
|
||||||
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
|
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
|
||||||
@ -1831,79 +1829,77 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
|||||||
p_diff_ptr += n_ctx - 1 - first;
|
p_diff_ptr += n_ctx - 1 - first;
|
||||||
kld_ptr += n_ctx - 1 - first;
|
kld_ptr += n_ctx - 1 - first;
|
||||||
|
|
||||||
printf("%4d", i+1);
|
LOG("%4d", i+1);
|
||||||
|
|
||||||
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
|
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
|
||||||
const double ppl_val = exp(log_ppl.first);
|
const double ppl_val = exp(log_ppl.first);
|
||||||
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
|
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
|
||||||
printf(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
|
LOG(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
|
||||||
|
|
||||||
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
|
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
|
||||||
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
|
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
|
||||||
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
|
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
|
||||||
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
|
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
|
||||||
printf(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
|
LOG(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
|
||||||
|
|
||||||
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
|
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
|
||||||
printf(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
|
LOG(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
|
||||||
|
|
||||||
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
|
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
|
||||||
const double p_diff_rms_val = sqrt(p_diff_mse.first);
|
const double p_diff_rms_val = sqrt(p_diff_mse.first);
|
||||||
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
|
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
|
||||||
printf(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
|
LOG(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
|
||||||
|
|
||||||
double p_top_val = 1.*kld.n_same_top/kld.count;
|
double p_top_val = 1.*kld.n_same_top/kld.count;
|
||||||
double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
|
double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
|
||||||
printf(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
|
LOG(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
|
||||||
|
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
|
|
||||||
fflush(stdout);
|
|
||||||
|
|
||||||
logits.clear();
|
logits.clear();
|
||||||
}
|
}
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
|
|
||||||
if (kld.count < 100) return; // we do not wish to do statistics on so few values
|
if (kld.count < 100) return; // we do not wish to do statistics on so few values
|
||||||
|
|
||||||
std::sort(kld_values.begin(), kld_values.end());
|
std::sort(kld_values.begin(), kld_values.end());
|
||||||
std::sort(p_diff_values.begin(), p_diff_values.end());
|
std::sort(p_diff_values.begin(), p_diff_values.end());
|
||||||
|
|
||||||
printf("====== Perplexity statistics ======\n");
|
LOG("====== Perplexity statistics ======\n");
|
||||||
|
|
||||||
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
|
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
|
||||||
const double ppl_val = exp(log_ppl.first);
|
const double ppl_val = exp(log_ppl.first);
|
||||||
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
|
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
|
||||||
printf("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
|
LOG("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
|
||||||
|
|
||||||
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
|
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
|
||||||
const double ppl_base_val = exp(log_ppl_base.first);
|
const double ppl_base_val = exp(log_ppl_base.first);
|
||||||
const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 )
|
const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 )
|
||||||
printf("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
|
LOG("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
|
||||||
|
|
||||||
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
|
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
|
||||||
// printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
|
// LOG("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
|
||||||
const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second);
|
const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second);
|
||||||
printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
|
LOG("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
|
||||||
|
|
||||||
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
|
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
|
||||||
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
|
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
|
||||||
printf("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
|
LOG("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
|
||||||
|
|
||||||
const double ppl_ratio_val = exp(log_ppl_ratio_val);
|
const double ppl_ratio_val = exp(log_ppl_ratio_val);
|
||||||
const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 )
|
const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 )
|
||||||
printf("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
|
LOG("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
|
||||||
|
|
||||||
const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov;
|
const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov;
|
||||||
const double ppl_diff_val = ppl_val - ppl_base_val;
|
const double ppl_diff_val = ppl_val - ppl_base_val;
|
||||||
const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov);
|
const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov);
|
||||||
printf("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
|
LOG("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
|
||||||
|
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
|
|
||||||
printf("====== KL divergence statistics ======\n");
|
LOG("====== KL divergence statistics ======\n");
|
||||||
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
|
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
|
||||||
printf("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
|
LOG("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
|
||||||
auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
|
auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
|
||||||
: kld_values[kld_values.size()/2];
|
: kld_values[kld_values.size()/2];
|
||||||
|
|
||||||
@ -1915,50 +1911,49 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
|||||||
return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
|
return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
|
||||||
};
|
};
|
||||||
|
|
||||||
printf("Maximum KLD: %10.6f\n", kld_values.back());
|
LOG("Maximum KLD: %10.6f\n", kld_values.back());
|
||||||
printf("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
|
LOG("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
|
||||||
printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
|
LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
|
||||||
printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
|
LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
|
||||||
printf("Median KLD: %10.6f\n", kld_median);
|
LOG("Median KLD: %10.6f\n", kld_median);
|
||||||
printf("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
|
LOG("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
|
||||||
printf(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
|
LOG(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
|
||||||
printf(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
|
LOG(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
|
||||||
printf("Minimum KLD: %10.6f\n", kld_values.front());
|
LOG("Minimum KLD: %10.6f\n", kld_values.front());
|
||||||
|
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
|
|
||||||
printf("====== Token probability statistics ======\n");
|
LOG("====== Token probability statistics ======\n");
|
||||||
|
|
||||||
auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
|
auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
|
||||||
printf("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);
|
LOG("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);
|
||||||
|
|
||||||
auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1])
|
auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1])
|
||||||
: p_diff_values[p_diff_values.size()/2];
|
: p_diff_values[p_diff_values.size()/2];
|
||||||
|
|
||||||
printf("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back());
|
LOG("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back());
|
||||||
printf("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
|
LOG("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
|
||||||
printf("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
|
LOG("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
|
||||||
printf("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
|
LOG("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
|
||||||
printf("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
|
LOG("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
|
||||||
printf("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
|
LOG("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
|
||||||
printf("Median Δp: %6.3lf%%\n", 100.0*p_diff_median);
|
LOG("Median Δp: %6.3lf%%\n", 100.0*p_diff_median);
|
||||||
printf("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
|
LOG("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
|
||||||
printf("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
|
LOG("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
|
||||||
printf(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
|
LOG(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
|
||||||
printf(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
|
LOG(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
|
||||||
printf(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
|
LOG(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
|
||||||
printf("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front());
|
LOG("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front());
|
||||||
|
|
||||||
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
|
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
|
||||||
// printf("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
|
// LOG("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
|
||||||
|
|
||||||
const double p_diff_rms_val = sqrt(p_diff_mse.first);
|
const double p_diff_rms_val = sqrt(p_diff_mse.first);
|
||||||
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
|
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
|
||||||
printf("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
|
LOG("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
|
||||||
|
|
||||||
const double same_top_p = 1.0*kld.n_same_top/kld.count;
|
const double same_top_p = 1.0*kld.n_same_top/kld.count;
|
||||||
printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
|
LOG("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
@ -1967,15 +1962,16 @@ int main(int argc, char ** argv) {
|
|||||||
params.n_ctx = 512;
|
params.n_ctx = 512;
|
||||||
params.logits_all = true;
|
params.logits_all = true;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
|
||||||
gpt_params_print_usage(argc, argv, params);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
gpt_init();
|
||||||
|
|
||||||
const int32_t n_ctx = params.n_ctx;
|
const int32_t n_ctx = params.n_ctx;
|
||||||
|
|
||||||
if (n_ctx <= 0) {
|
if (n_ctx <= 0) {
|
||||||
fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
|
LOG_ERR("%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2000,21 +1996,11 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (params.ppl_stride > 0) {
|
if (params.ppl_stride > 0) {
|
||||||
fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
|
LOG_INF("Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
|
||||||
params.n_ctx, params.n_ctx + params.ppl_stride/2);
|
params.n_ctx, params.n_ctx + params.ppl_stride/2);
|
||||||
params.n_ctx += params.ppl_stride/2;
|
params.n_ctx += params.ppl_stride/2;
|
||||||
}
|
}
|
||||||
|
|
||||||
print_build_info();
|
|
||||||
|
|
||||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
|
||||||
params.seed = time(NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
|
|
||||||
|
|
||||||
std::mt19937 rng(params.seed);
|
|
||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
|
|
||||||
@ -2024,21 +2010,21 @@ int main(int argc, char ** argv) {
|
|||||||
llama_model * model = llama_init.model;
|
llama_model * model = llama_init.model;
|
||||||
llama_context * ctx = llama_init.context;
|
llama_context * ctx = llama_init.context;
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
LOG_ERR("%s: unable to load model\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int n_ctx_train = llama_n_ctx_train(model);
|
const int n_ctx_train = llama_n_ctx_train(model);
|
||||||
|
|
||||||
if (params.n_ctx > n_ctx_train) {
|
if (params.n_ctx > n_ctx_train) {
|
||||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
|
||||||
__func__, n_ctx_train, params.n_ctx);
|
__func__, n_ctx_train, params.n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
fprintf(stderr, "\n");
|
LOG_INF("\n");
|
||||||
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
struct results_perplexity results;
|
struct results_perplexity results;
|
||||||
@ -2054,7 +2040,9 @@ int main(int argc, char ** argv) {
|
|||||||
results = perplexity(ctx, params, n_ctx);
|
results = perplexity(ctx, params, n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_print_timings(ctx);
|
LOG("\n");
|
||||||
|
llama_perf_context_print(ctx);
|
||||||
|
|
||||||
write_logfile(ctx, params, model, results);
|
write_logfile(ctx, params, model, results);
|
||||||
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
#define LLAMA_API_INTERNAL
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
#include "llama-impl.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
@ -319,8 +319,7 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
auto cparams = llama_context_default_params();
|
auto cparams = llama_context_default_params();
|
||||||
cparams.n_ctx = 256;
|
cparams.n_ctx = 256;
|
||||||
cparams.seed = 1;
|
|
||||||
|
|
||||||
ctx = llama_new_context_with_model(model, cparams);
|
ctx = llama_new_context_with_model(model, cparams);
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
set(TARGET llama-quantize)
|
set(TARGET llama-quantize)
|
||||||
add_executable(${TARGET} quantize.cpp)
|
add_executable(${TARGET} quantize.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
|
@@ -54,6 +54,8 @@ As the models are currently fully loaded into memory, you will need adequate dis
 
 Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
 
+The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format.
+
 *(outdated)*
 
 | Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
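For reference, a minimal invocation of the quantize tool follows the pattern below; the model paths are placeholders and `Q4_K_M` stands in for any of the supported quantization types listed in `quantize.cpp`:

```bash
# quantize an f16 GGUF model to Q4_K_M (paths are illustrative)
./llama-quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
```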
@ -26,6 +26,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
|||||||
{ "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
|
{ "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
|
||||||
{ "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
|
{ "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
|
||||||
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
|
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
|
||||||
|
{ "TQ1_0", LLAMA_FTYPE_MOSTLY_TQ1_0, " 1.69 bpw ternarization", },
|
||||||
|
{ "TQ2_0", LLAMA_FTYPE_MOSTLY_TQ2_0, " 2.06 bpw ternarization", },
|
||||||
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
|
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
|
||||||
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
|
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
|
||||||
{ "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
|
{ "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
|
||||||
@ -104,7 +106,7 @@ static void usage(const char * executable) {
|
|||||||
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
|
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
|
||||||
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
|
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
|
||||||
printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
|
printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
|
||||||
printf(" --keep-split: will generate quatized model in the same shards as input");
|
printf(" --keep-split: will generate quantized model in the same shards as input\n");
|
||||||
printf(" --override-kv KEY=TYPE:VALUE\n");
|
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||||
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
|
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
|
||||||
printf("Note: --include-weights and --exclude-weights cannot be used together\n");
|
printf("Note: --include-weights and --exclude-weights cannot be used together\n");
|
||||||
|
@ -1,15 +1,16 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "log.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
|
#include <iostream> // TODO: remove me
|
||||||
|
|
||||||
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
static void print_usage(int, char ** argv) {
|
||||||
gpt_params_print_usage(argc, argv, params);
|
LOG("\nexample usage:\n");
|
||||||
|
LOG("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
|
||||||
LOG_TEE("\nexample usage:\n");
|
LOG("\n");
|
||||||
LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
|
|
||||||
LOG_TEE("\n");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
struct chunk {
|
struct chunk {
|
||||||
@ -18,7 +19,7 @@ struct chunk {
|
|||||||
// original file position
|
// original file position
|
||||||
size_t filepos;
|
size_t filepos;
|
||||||
// original text data
|
// original text data
|
||||||
std::string textdata = "";
|
std::string textdata;
|
||||||
// tokenized text data
|
// tokenized text data
|
||||||
std::vector<llama_token> tokens;
|
std::vector<llama_token> tokens;
|
||||||
// embedding
|
// embedding
|
||||||
@ -32,14 +33,14 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
|
|||||||
std::ifstream f(filename.c_str());
|
std::ifstream f(filename.c_str());
|
||||||
|
|
||||||
if (!f.is_open()) {
|
if (!f.is_open()) {
|
||||||
fprintf(stderr, "Error: could not open file %s\n", filename.c_str());
|
LOG_ERR("could not open file %s\n", filename.c_str());
|
||||||
return chunks;
|
return chunks;
|
||||||
}
|
}
|
||||||
|
|
||||||
chunk current_chunk;
|
chunk current_chunk;
|
||||||
char buffer[1024];
|
char buffer[1024];
|
||||||
int64_t filepos = 0;
|
int64_t filepos = 0;
|
||||||
std::string current = "";
|
std::string current;
|
||||||
while (f.read(buffer, 1024)) {
|
while (f.read(buffer, 1024)) {
|
||||||
current += std::string(buffer, f.gcount());
|
current += std::string(buffer, f.gcount());
|
||||||
size_t pos;
|
size_t pos;
|
||||||
@ -85,9 +86,9 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
|||||||
llama_kv_cache_clear(ctx);
|
llama_kv_cache_clear(ctx);
|
||||||
|
|
||||||
// run model
|
// run model
|
||||||
fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
||||||
if (llama_decode(ctx, batch) < 0) {
|
if (llama_decode(ctx, batch) < 0) {
|
||||||
fprintf(stderr, "%s : failed to decode\n", __func__);
|
LOG_ERR("%s : failed to decode\n", __func__);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < batch.n_tokens; i++) {
|
for (int i = 0; i < batch.n_tokens; i++) {
|
||||||
@ -100,7 +101,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
|||||||
if (embd == NULL) {
|
if (embd == NULL) {
|
||||||
embd = llama_get_embeddings_ith(ctx, i);
|
embd = llama_get_embeddings_ith(ctx, i);
|
||||||
if (embd == NULL) {
|
if (embd == NULL) {
|
||||||
fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
|
LOG_ERR("%s: failed to get embeddings for token %d\n", __func__, i);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -113,29 +114,28 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
|||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
|
||||||
print_usage(argc, argv, params);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
gpt_init();
|
||||||
|
|
||||||
// For BERT models, batch size must be equal to ubatch size
|
// For BERT models, batch size must be equal to ubatch size
|
||||||
params.n_ubatch = params.n_batch;
|
params.n_ubatch = params.n_batch;
|
||||||
params.embedding = true;
|
params.embedding = true;
|
||||||
|
|
||||||
if (params.chunk_size <= 0) {
|
if (params.chunk_size <= 0) {
|
||||||
fprintf(stderr, "chunk_size must be positive\n");
|
LOG_ERR("chunk_size must be positive\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
if (params.context_files.empty()) {
|
if (params.context_files.empty()) {
|
||||||
fprintf(stderr, "context_files must be specified\n");
|
LOG_ERR("context_files must be specified\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
print_build_info();
|
LOG_INF("processing files:\n");
|
||||||
|
|
||||||
printf("processing files:\n");
|
|
||||||
for (auto & context_file : params.context_files) {
|
for (auto & context_file : params.context_files) {
|
||||||
printf("%s\n", context_file.c_str());
|
LOG_INF("%s\n", context_file.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<chunk> chunks;
|
std::vector<chunk> chunks;
|
||||||
@ -143,7 +143,7 @@ int main(int argc, char ** argv) {
|
|||||||
std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
|
std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
|
||||||
chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
|
chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
|
||||||
}
|
}
|
||||||
printf("Number of chunks: %ld\n", chunks.size());
|
LOG_INF("Number of chunks: %ld\n", chunks.size());
|
||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
@ -155,7 +155,7 @@ int main(int argc, char ** argv) {
|
|||||||
llama_context * ctx = llama_init.context;
|
llama_context * ctx = llama_init.context;
|
||||||
|
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
LOG_ERR("%s: unable to load model\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -164,19 +164,19 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
|
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
|
||||||
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
||||||
fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
|
LOG_ERR("%s: pooling type NONE not supported\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (n_ctx > n_ctx_train) {
|
if (n_ctx > n_ctx_train) {
|
||||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||||
__func__, n_ctx_train, n_ctx);
|
__func__, n_ctx_train, n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
fprintf(stderr, "\n");
|
LOG_INF("\n");
|
||||||
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
// max batch size
|
// max batch size
|
||||||
@ -187,7 +187,7 @@ int main(int argc, char ** argv) {
|
|||||||
for (auto & chunk : chunks) {
|
for (auto & chunk : chunks) {
|
||||||
auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false);
|
auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false);
|
||||||
if (inp.size() > n_batch) {
|
if (inp.size() > n_batch) {
|
||||||
fprintf(stderr, "%s: error: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
|
LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
|
||||||
__func__, (long long int) inp.size(), (long long int) n_batch);
|
__func__, (long long int) inp.size(), (long long int) n_batch);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@ -201,12 +201,12 @@ int main(int argc, char ** argv) {
|
|||||||
// tokenization stats
|
// tokenization stats
|
||||||
if (params.verbose_prompt) {
|
if (params.verbose_prompt) {
|
||||||
for (int i = 0; i < (int) chunks.size(); i++) {
|
for (int i = 0; i < (int) chunks.size(); i++) {
|
||||||
fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
|
LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
|
||||||
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
|
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
|
||||||
for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
|
for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
|
||||||
fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
|
LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
|
||||||
}
|
}
|
||||||
fprintf(stderr, "\n\n");
|
LOG_INF("\n\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -258,7 +258,7 @@ int main(int argc, char ** argv) {
|
|||||||
// start loop, receive query and return top k similar chunks based on cosine similarity
|
// start loop, receive query and return top k similar chunks based on cosine similarity
|
||||||
std::string query;
|
std::string query;
|
||||||
while (true) {
|
while (true) {
|
||||||
printf("Enter query: ");
|
LOG("Enter query: ");
|
||||||
std::getline(std::cin, query);
|
std::getline(std::cin, query);
|
||||||
std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
|
std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
|
||||||
|
|
||||||
@ -282,20 +282,22 @@ int main(int argc, char ** argv) {
|
|||||||
return a.second > b.second;
|
return a.second > b.second;
|
||||||
});
|
});
|
||||||
|
|
||||||
printf("Top %d similar chunks:\n", params.sparams.top_k);
|
LOG("Top %d similar chunks:\n", params.sparams.top_k);
|
||||||
for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
|
for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
|
||||||
printf("filename: %s\n", chunks[similarities[i].first].filename.c_str());
|
LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str());
|
||||||
printf("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
|
LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
|
||||||
printf("similarity: %f\n", similarities[i].second);
|
LOG("similarity: %f\n", similarities[i].second);
|
||||||
printf("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
|
LOG("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
|
||||||
printf("--------------------\n");
|
LOG("--------------------\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
LOG("\n");
|
||||||
|
llama_perf_context_print(ctx);
|
||||||
|
|
||||||
// clean up
|
// clean up
|
||||||
llama_batch_free(query_batch);
|
llama_batch_free(query_batch);
|
||||||
llama_print_timings(ctx);
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
@@ -10,20 +10,21 @@ This can be used for distributed LLM inference with `llama.cpp` in the following
 
 ```mermaid
 flowchart TD
-    rpcb---|TCP|srva
+    rpcb<-->|TCP|srva
-    rpcb---|TCP|srvb
+    rpcb<-->|TCP|srvb
-    rpcb-.-|TCP|srvn
+    rpcb<-.->|TCP|srvn
     subgraph hostn[Host N]
-        srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"]
+        srvn[rpc-server]<-.->backend3["Backend (CUDA,Metal,etc.)"]
     end
     subgraph hostb[Host B]
-        srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"]
+        srvb[rpc-server]<-->backend2["Backend (CUDA,Metal,etc.)"]
     end
     subgraph hosta[Host A]
-        srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"]
+        srva[rpc-server]<-->backend["Backend (CUDA,Metal,etc.)"]
     end
     subgraph host[Main Host]
-        ggml[llama.cpp]---rpcb[RPC backend]
+        local["Backend (CUDA,Metal,etc.)"]<-->ggml[llama-cli]
+        ggml[llama-cli]<-->rpcb[RPC backend]
     end
     style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5
 ```
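As a concrete counterpart to the diagram, each worker host exposes its local backend by running an `rpc-server` instance; the port and the `CUDA_VISIBLE_DEVICES` pinning below are simply the values used elsewhere in this README:

```bash
# on a worker host: serve the local backend over TCP
CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
```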
@@ -62,17 +63,12 @@ $ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
 This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
 
-On the main host build `llama.cpp` only with `-DGGML_RPC=ON`:
-
-```bash
-mkdir build-rpc
-cd build-rpc
-cmake .. -DGGML_RPC=ON
-cmake --build . --config Release
-```
-
-Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
+On the main host build `llama.cpp` for the local backend and add `-DGGML_RPC=ON` to the build options.
+Finally, when running `llama-cli`, use the `--rpc` option to specify the host and port of each `rpc-server`:
 
 ```bash
 $ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
 ```
 
+This way you can offload model layers to both local and remote devices.
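A sketch of that main-host build step, assuming a CUDA local backend (substitute the backend option that matches your hardware):

```bash
# build llama.cpp with the local backend plus the RPC backend enabled
cmake -B build -DGGML_CUDA=ON -DGGML_RPC=ON
cmake --build build --config Release
```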
@ -1,17 +1,17 @@
|
|||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <chrono>
|
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
params.prompt = "The quick brown fox";
|
params.prompt = "The quick brown fox";
|
||||||
|
params.sparams.seed = 1234;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params)) {
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
|
||||||
gpt_params_print_usage(argc, argv, params);
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -38,6 +38,13 @@ int main(int argc, char ** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
auto sparams = llama_sampler_chain_default_params();
|
||||||
|
|
||||||
|
llama_sampler * smpl = llama_sampler_chain_init(sparams);
|
||||||
|
|
||||||
|
llama_sampler_chain_add(smpl, llama_sampler_init_softmax());
|
||||||
|
llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed));
|
||||||
|
|
||||||
// tokenize prompt
|
// tokenize prompt
|
||||||
auto tokens = llama_tokenize(ctx, params.prompt, true);
|
auto tokens = llama_tokenize(ctx, params.prompt, true);
|
||||||
|
|
||||||
@ -64,16 +71,7 @@ int main(int argc, char ** argv) {
|
|||||||
printf("\nfirst run: %s", params.prompt.c_str());
|
printf("\nfirst run: %s", params.prompt.c_str());
|
||||||
|
|
||||||
for (auto i = 0; i < params.n_predict; i++) {
|
for (auto i = 0; i < params.n_predict; i++) {
|
||||||
auto * logits = llama_get_logits(ctx);
|
auto next_token = llama_sampler_sample(smpl, ctx, -1);
|
||||||
auto n_vocab = llama_n_vocab(model);
|
|
||||||
|
|
||||||
std::vector<llama_token_data> candidates;
|
|
||||||
candidates.reserve(n_vocab);
|
|
||||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
|
||||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
|
||||||
}
|
|
||||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
|
||||||
auto next_token = llama_sample_token(ctx, &candidates_p);
|
|
||||||
auto next_token_str = llama_token_to_piece(ctx, next_token);
|
auto next_token_str = llama_token_to_piece(ctx, next_token);
|
||||||
|
|
||||||
printf("%s", next_token_str.c_str());
|
printf("%s", next_token_str.c_str());
|
||||||
@ -96,6 +94,11 @@ int main(int argc, char ** argv) {
|
|||||||
// make new context
|
// make new context
|
||||||
auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
|
auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
|
||||||
|
|
||||||
|
llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
|
||||||
|
|
||||||
|
llama_sampler_chain_add(smpl2, llama_sampler_init_softmax());
|
||||||
|
llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed));
|
||||||
|
|
||||||
printf("\nsecond run: %s", params.prompt.c_str());
|
printf("\nsecond run: %s", params.prompt.c_str());
|
||||||
|
|
||||||
// load state (rng, logits, embedding and kv_cache) from file
|
// load state (rng, logits, embedding and kv_cache) from file
|
||||||
@ -124,15 +127,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// second run
|
// second run
|
||||||
for (auto i = 0; i < params.n_predict; i++) {
|
for (auto i = 0; i < params.n_predict; i++) {
|
||||||
auto * logits = llama_get_logits(ctx2);
|
auto next_token = llama_sampler_sample(smpl2, ctx2, -1);
|
||||||
auto n_vocab = llama_n_vocab(model);
|
|
||||||
std::vector<llama_token_data> candidates;
|
|
||||||
candidates.reserve(n_vocab);
|
|
||||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
|
||||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
|
||||||
}
|
|
||||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
|
||||||
auto next_token = llama_sample_token(ctx2, &candidates_p);
|
|
||||||
auto next_token_str = llama_token_to_piece(ctx2, next_token);
|
auto next_token_str = llama_token_to_piece(ctx2, next_token);
|
||||||
|
|
||||||
printf("%s", next_token_str.c_str());
|
printf("%s", next_token_str.c_str());
|
||||||
@ -157,7 +152,12 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// make new context
|
// make new context
|
||||||
auto* ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
|
auto * ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
|
||||||
|
|
||||||
|
llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
|
||||||
|
|
||||||
|
llama_sampler_chain_add(smpl3, llama_sampler_init_softmax());
|
||||||
|
llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed));
|
||||||
|
|
||||||
printf("\nsingle seq run: %s", params.prompt.c_str());
|
printf("\nsingle seq run: %s", params.prompt.c_str());
|
||||||
|
|
||||||
@ -215,15 +215,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// third run with seq 1 instead of 0
|
// third run with seq 1 instead of 0
|
||||||
for (auto i = 0; i < params.n_predict; i++) {
|
for (auto i = 0; i < params.n_predict; i++) {
|
||||||
auto * logits = llama_get_logits(ctx3);
|
auto next_token = llama_sampler_sample(smpl3, ctx3, -1);
|
||||||
auto n_vocab = llama_n_vocab(model);
|
|
||||||
std::vector<llama_token_data> candidates;
|
|
||||||
candidates.reserve(n_vocab);
|
|
||||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
|
||||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
|
||||||
}
|
|
||||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
|
||||||
auto next_token = llama_sample_token(ctx3, &candidates_p);
|
|
||||||
auto next_token_str = llama_token_to_piece(ctx3, next_token);
|
auto next_token_str = llama_token_to_piece(ctx3, next_token);
|
||||||
|
|
||||||
printf("%s", next_token_str.c_str());
|
printf("%s", next_token_str.c_str());
|
||||||
@ -240,6 +232,10 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
printf("\n");
|
printf("\n");
|
||||||
|
|
||||||
|
llama_sampler_free(smpl);
|
||||||
|
llama_sampler_free(smpl2);
|
||||||
|
llama_sampler_free(smpl3);
|
||||||
|
|
||||||
llama_free(ctx3);
|
llama_free(ctx3);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
set(TARGET llama-server)
|
set(TARGET llama-server)
|
||||||
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
|
|
||||||
option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
|
option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
|
||||||
|
|
||||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
|
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
|
||||||
|
|
||||||
@ -30,6 +30,7 @@ set(PUBLIC_ASSETS
|
|||||||
system-prompts.js
|
system-prompts.js
|
||||||
prompt-formats.js
|
prompt-formats.js
|
||||||
json-schema-to-grammar.mjs
|
json-schema-to-grammar.mjs
|
||||||
|
loading.html
|
||||||
)
|
)
|
||||||
|
|
||||||
foreach(asset ${PUBLIC_ASSETS})
|
foreach(asset ${PUBLIC_ASSETS})
|
||||||
@ -45,9 +46,6 @@ endforeach()
|
|||||||
|
|
||||||
add_executable(${TARGET} ${TARGET_SRCS})
|
add_executable(${TARGET} ${TARGET_SRCS})
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_compile_definitions(${TARGET} PRIVATE
|
|
||||||
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
|
||||||
)
|
|
||||||
|
|
||||||
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
||||||
|
@ -17,255 +17,144 @@ The project is under active development, and we are [looking for feedback and co
|
|||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
|
| Argument | Explanation |
|
||||||
|
| -------- | ----------- |
|
||||||
|
| `-h, --help, --usage` | print usage and exit |
|
||||||
|
| `--version` | show version and build info |
|
||||||
|
| `-v, --verbose` | print verbose information |
|
||||||
|
| `--verbosity N` | set specific verbosity level (default: 0) |
|
||||||
|
| `-t, --threads N` | number of threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
|
||||||
|
| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
|
||||||
|
| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
|
||||||
|
| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask |
|
||||||
|
| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0)<br/> |
|
||||||
|
| `--prio N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
|
||||||
|
| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50)<br/> |
|
||||||
|
| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) |
|
||||||
|
| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch |
|
||||||
|
| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
|
||||||
|
| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
|
||||||
|
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
|
||||||
|
| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
|
||||||
|
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
|
||||||
|
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
|
||||||
|
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
|
||||||
|
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
|
||||||
|
| `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
|
||||||
|
| `-p, --prompt PROMPT` | prompt to start generation with |
|
||||||
|
| `-f, --file FNAME` | a file containing the prompt (default: none) |
|
||||||
|
| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
|
||||||
|
| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
|
||||||
|
| `--no-escape` | do not process escape sequences |
|
||||||
|
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
|
||||||
|
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typ_p;top_p;min_p;temperature) |
|
||||||
|
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) |
|
||||||
|
| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
|
||||||
|
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
|
||||||
|
| `--penalize-nl` | penalize newline tokens (default: false) |
|
||||||
|
| `--temp N` | temperature (default: 0.8) |
|
||||||
|
| `--top-k N` | top-k sampling (default: 40, 0 = disabled) |
| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
| `--tfs N` | tail free sampling, parameter z (default: 1.0, 1.0 = disabled) |
| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) |
| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) |
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) |
| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) |
| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.1) |
| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.0) |
| `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,<br/>i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',<br/>or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' |
| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
| `--grammar-file FNAME` | file to read grammar from |
| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model |
| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N |
| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model) |
| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N |
| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size) |
| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation) |
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0) |
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0) |
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0) |
| `-gan, --grp-attn-n N` | group-attention factor (default: 1) |
| `-gaw, --grp-attn-w N` | group-attention width (default: 512.0) |
| `-dkvc, --dump-kv-cache` | verbose print of the KV cache |
| `-nkvo, --no-kv-offload` | disable KV offload |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing |
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) |
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437 |
| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs |
| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 |
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) |
| `--check-tensors` | check model tensor data for invalid values (default: false) |
| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false |
| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) |
| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) |
| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE<br/>note: this argument can be repeated to add multiple scaled control vectors |
| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
| `-a, --alias STRING` | set alias for model name (to be used by REST API) |
| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
| `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
| `-hff, --hf-file FILE` | Hugging Face model file (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
| `--host HOST` | ip address to listen (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
| `--path PATH` | path to serve static files from (default: ) |
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
| `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
| `--api-key-file FNAME` | path to file containing API keys (default: none) |
| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key |
| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate |
| `-to, --timeout N` | server read/write timeout in seconds (default: 600) |
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
| `--no-slots` | disables slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted:<br/>https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled) |
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
| `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) |
| `--log-test` | Log test |
| `--log-disable` | Log disable |
| `--log-enable` | Log enable |
| `--log-new` | Log new |
| `--log-append` | Log append |
| `--log-file FNAME` | Log file |

Note: If a command line argument and its environment variable are both set for the same parameter, the argument takes precedence over the environment variable.
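
As a quick illustration of that precedence (a sketch only; binary location, port values and model path are placeholders):

```sh
# LLAMA_ARG_PORT is set, but the explicit flag wins: the server listens on 9090
LLAMA_ARG_PORT=8081 ./llama-server --port 9090 -m models/my_model.gguf
```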

Example usage of docker compose with environment variables:

```yml
services:
  llamacpp-server:
    image: ghcr.io/ggerganov/llama.cpp:server
    ports:
      - 8080:8080
    volumes:
      - ./models:/models
    environment:
      # alternatively, you can use "LLAMA_ARG_MODEL_URL" to download the model
      LLAMA_ARG_MODEL: /models/my_model.gguf
      LLAMA_ARG_CTX_SIZE: 4096
      LLAMA_ARG_N_PARALLEL: 2
      LLAMA_ARG_ENDPOINT_METRICS: 1
      LLAMA_ARG_PORT: 8080
```
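
Once the container is running, the server can be exercised over HTTP. A minimal sketch, assuming the compose file above is used unchanged so the server listens on localhost:8080:

```sh
# liveness check
curl http://localhost:8080/health

# OpenAI-compatible chat completion using the loaded model
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{"messages": [{"role": "user", "content": "Say hello to llama.cpp"}]}'
```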
```
usage: ./llama-server [options]

general:

  -h, --help, --usage  print usage and exit
  --version  show version and build info
  -v, --verbose  print verbose information
  --verbosity N  set specific verbosity level (default: 0)
  --verbose-prompt  print a verbose prompt before generation (default: false)
  --no-display-prompt  don't print prompt at generation (default: false)
  -co, --color  colorise output to distinguish prompt and user input from generations (default: false)
  -s, --seed SEED  RNG seed (default: -1, use random seed for < 0)
  -t, --threads N  number of threads to use during generation (default: 8)
  -tb, --threads-batch N  number of threads to use during batch and prompt processing (default: same as --threads)
  -td, --threads-draft N  number of threads to use during generation (default: same as --threads)
  -tbd, --threads-batch-draft N  number of threads to use during batch and prompt processing (default: same as --threads-draft)
  --draft N  number of tokens to draft for speculative decoding (default: 5)
  -ps, --p-split N  speculative decoding split probability (default: 0.1)
  -lcs, --lookup-cache-static FNAME
      path to static lookup cache to use for lookup decoding (not updated by generation)
  -lcd, --lookup-cache-dynamic FNAME
      path to dynamic lookup cache to use for lookup decoding (updated by generation)
  -c, --ctx-size N  size of the prompt context (default: 0, 0 = loaded from model)
  -n, --predict N  number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)
  -b, --batch-size N  logical maximum batch size (default: 2048)
  -ub, --ubatch-size N  physical maximum batch size (default: 512)
  --keep N  number of tokens to keep from the initial prompt (default: 0, -1 = all)
  --chunks N  max number of chunks to process (default: -1, -1 = all)
  -fa, --flash-attn  enable Flash Attention (default: disabled)
  -p, --prompt PROMPT  prompt to start generation with
      in conversation mode, this will be used as system prompt
      (default: '')
  -f, --file FNAME  a file containing the prompt (default: none)
  --in-file FNAME  an input file (repeat to specify multiple files)
  -bf, --binary-file FNAME  binary file containing the prompt (default: none)
  -e, --escape  process escapes sequences (\n, \r, \t, \', \", \\) (default: true)
  --no-escape  do not process escape sequences
  -ptc, --print-token-count N  print token count every N tokens (default: -1)
  --prompt-cache FNAME  file to cache prompt state for faster startup (default: none)
  --prompt-cache-all  if specified, saves user input and generations to cache as well
      not supported with --interactive or other interactive options
  --prompt-cache-ro  if specified, uses the prompt cache but does not update it
  -r, --reverse-prompt PROMPT  halt generation at PROMPT, return control in interactive mode
      can be specified more than once for multiple prompts
  -sp, --special  special tokens output enabled (default: false)
  -cnv, --conversation  run in conversation mode, does not print special tokens and suffix/prefix
      if suffix/prefix are not specified, default chat template will be used
      (default: false)
  -i, --interactive  run in interactive mode (default: false)
  -if, --interactive-first  run in interactive mode and wait for input right away (default: false)
  -mli, --multiline-input  allows you to write or paste multiple lines without ending each in '\'
  --in-prefix-bos  prefix BOS to user inputs, preceding the `--in-prefix` string
  --in-prefix STRING  string to prefix user inputs with (default: empty)
  --in-suffix STRING  string to suffix after user inputs with (default: empty)
  --spm-infill  use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled)

sampling:

  --samplers SAMPLERS  samplers that will be used for generation in the order, separated by ';'
      (default: top_k;tfs_z;typical_p;top_p;min_p;temperature)
  --sampling-seq SEQUENCE  simplified sequence for samplers that will be used (default: kfypmt)
  --ignore-eos  ignore end of stream token and continue generating (implies --logit-bias EOS-inf)
  --penalize-nl  penalize newline tokens (default: false)
  --temp N  temperature (default: 0.8)
  --top-k N  top-k sampling (default: 40, 0 = disabled)
  --top-p N  top-p sampling (default: 0.9, 1.0 = disabled)
  --min-p N  min-p sampling (default: 0.1, 0.0 = disabled)
  --tfs N  tail free sampling, parameter z (default: 1.0, 1.0 = disabled)
  --typical N  locally typical sampling, parameter p (default: 1.0, 1.0 = disabled)
  --repeat-last-n N  last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size)
  --repeat-penalty N  penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled)
  --presence-penalty N  repeat alpha presence penalty (default: 0.0, 0.0 = disabled)
  --frequency-penalty N  repeat alpha frequency penalty (default: 0.0, 0.0 = disabled)
  --dynatemp-range N  dynamic temperature range (default: 0.0, 0.0 = disabled)
  --dynatemp-exp N  dynamic temperature exponent (default: 1.0)
  --mirostat N  use Mirostat sampling.
      Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.
      (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)
  --mirostat-lr N  Mirostat learning rate, parameter eta (default: 0.1)
  --mirostat-ent N  Mirostat target entropy, parameter tau (default: 5.0)
  -l TOKEN_ID(+/-)BIAS  modifies the likelihood of token appearing in the completion,
      i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
      or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'
  --cfg-negative-prompt PROMPT
      negative prompt to use for guidance (default: '')
  --cfg-negative-prompt-file FNAME
      negative prompt file to use for guidance
  --cfg-scale N  strength of guidance (default: 1.0, 1.0 = disable)
  --chat-template JINJA_TEMPLATE
      set custom jinja chat template (default: template taken from model's metadata)
      if suffix/prefix are specified, template will be disabled
      only commonly used templates are accepted:
      https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template

grammar:

  --grammar GRAMMAR  BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '')
  --grammar-file FNAME  file to read grammar from
  -j, --json-schema SCHEMA  JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
      For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead

embedding:

  --pooling {none,mean,cls,last}
      pooling type for embeddings, use model default if unspecified
  --attention {causal,non-causal}
      attention type for embeddings, use model default if unspecified

context hacking:

  --rope-scaling {none,linear,yarn}
      RoPE frequency scaling method, defaults to linear unless specified by the model
  --rope-scale N  RoPE context scaling factor, expands context by a factor of N
  --rope-freq-base N  RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
  --rope-freq-scale N  RoPE frequency scaling factor, expands context by a factor of 1/N
  --yarn-orig-ctx N  YaRN: original context size of model (default: 0 = model training context size)
  --yarn-ext-factor N  YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)
  --yarn-attn-factor N  YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
  --yarn-beta-slow N  YaRN: high correction dim or alpha (default: 1.0)
  --yarn-beta-fast N  YaRN: low correction dim or beta (default: 32.0)
  -gan, --grp-attn-n N  group-attention factor (default: 1)
  -gaw, --grp-attn-w N  group-attention width (default: 512.0)
  -dkvc, --dump-kv-cache  verbose print of the KV cache
  -nkvo, --no-kv-offload  disable KV offload
  -ctk, --cache-type-k TYPE  KV cache data type for K (default: f16)
  -ctv, --cache-type-v TYPE  KV cache data type for V (default: f16)

perplexity:

  --all-logits  return logits for all tokens in the batch (default: false)
  --hellaswag  compute HellaSwag score over random tasks from datafile supplied with -f
  --hellaswag-tasks N  number of tasks to use when computing the HellaSwag score (default: 400)
  --winogrande  compute Winogrande score over random tasks from datafile supplied with -f
  --winogrande-tasks N  number of tasks to use when computing the Winogrande score (default: 0)
  --multiple-choice  compute multiple choice score over random tasks from datafile supplied with -f
  --multiple-choice-tasks N
      number of tasks to use when computing the multiple choice score (default: 0)
  --kl-divergence  computes KL-divergence to logits provided via --kl-divergence-base
  --ppl-stride N  stride for perplexity calculation (default: 0)
  --ppl-output-type {0,1}  output type for perplexity calculation (default: 0)

parallel:

  -dt, --defrag-thold N  KV cache defragmentation threshold (default: -1.0, < 0 - disabled)
  -np, --parallel N  number of parallel sequences to decode (default: 1)
  -ns, --sequences N  number of sequences to decode (default: 1)
  -cb, --cont-batching  enable continuous batching (a.k.a dynamic batching) (default: enabled)

multi-modality:

  --mmproj FILE  path to a multimodal projector file for LLaVA. see examples/llava/README.md
  --image FILE  path to an image file. use with multimodal models. Specify multiple times for batching

backend:

  --rpc SERVERS  comma separated list of RPC servers
  --mlock  force system to keep model in RAM rather than swapping or compressing
  --no-mmap  do not memory-map model (slower load but may reduce pageouts if not using mlock)
  --numa TYPE  attempt optimizations that help on some NUMA systems
      - distribute: spread execution evenly over all nodes
      - isolate: only spawn threads on CPUs on the node that execution started on
      - numactl: use the CPU map provided by numactl
      if run without this previously, it is recommended to drop the system page cache before using this
      see https://github.com/ggerganov/llama.cpp/issues/1437

model:

  --check-tensors  check model tensor data for invalid values (default: false)
  --override-kv KEY=TYPE:VALUE
      advanced option to override model metadata by key. may be specified multiple times.
      types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false
  --lora FNAME  apply LoRA adapter (implies --no-mmap)
  --lora-scaled FNAME S  apply LoRA adapter with user defined scaling S (implies --no-mmap)
  --lora-base FNAME  optional model to use as a base for the layers modified by the LoRA adapter
  --control-vector FNAME  add a control vector
      note: this argument can be repeated to add multiple control vectors
  --control-vector-scaled FNAME SCALE
      add a control vector with user defined scaling SCALE
      note: this argument can be repeated to add multiple scaled control vectors
  --control-vector-layer-range START END
      layer range to apply the control vector(s) to, start and end inclusive
  -m, --model FNAME  model path (default: models/$filename with filename from --hf-file
      or --model-url if set, otherwise models/7B/ggml-model-f16.gguf)
  -md, --model-draft FNAME  draft model for speculative decoding (default: unused)
  -mu, --model-url MODEL_URL  model download url (default: unused)
  -hfr, --hf-repo REPO  Hugging Face model repository (default: unused)
  -hff, --hf-file FILE  Hugging Face model file (default: unused)
  -hft, --hf-token TOKEN  Hugging Face access token (default: value from HF_TOKEN environment variable)

server:

  --host HOST  ip address to listen (default: 127.0.0.1)
  --port PORT  port to listen (default: 8080)
  --path PATH  path to serve static files from (default: )
  --embedding(s)  restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
  --api-key KEY  API key to use for authentication (default: none)
  --api-key-file FNAME  path to file containing API keys (default: none)
  --ssl-key-file FNAME  path to file a PEM-encoded SSL private key
  --ssl-cert-file FNAME  path to file a PEM-encoded SSL certificate
  --timeout N  server read/write timeout in seconds (default: 600)
  --threads-http N  number of threads used to process HTTP requests (default: -1)
  --system-prompt-file FNAME
      set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications
  --log-format {text,json}
      log output format: json or text (default: json)
  --metrics  enable prometheus compatible metrics endpoint (default: disabled)
  --no-slots  disables slots monitoring endpoint (default: enabled)
  --slot-save-path PATH  path to save slot kv cache (default: disabled)
  --chat-template JINJA_TEMPLATE
      set custom jinja chat template (default: template taken from model's metadata)
      only commonly used templates are accepted:
      https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
  -sps, --slot-prompt-similarity SIMILARITY
      how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
  --lora-init-without-apply
      load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled)

logging:

  --simple-io  use basic IO for better compatibility in subprocesses and limited consoles
  -ld, --logdir LOGDIR  path under which to save YAML logs (no logging if unset)
  --log-test  Run simple logging test
  --log-disable  Disable trace logs
  --log-enable  Enable trace logs
  --log-file FNAME  Specify a log filename (without extension)
  --log-new  Create a separate new log file on start. Each log file will have unique name: "<name>.<ID>.log"
  --log-append  Don't truncate the old log file.
```

Available environment variables (if specified, these variables will override parameters specified in arguments):

- `LLAMA_CACHE` (cache directory, used by `--hf-repo`)
- `HF_TOKEN` (Hugging Face access token, used when accessing a gated model with `--hf-repo`)
- `LLAMA_ARG_MODEL`
- `LLAMA_ARG_THREADS`
- `LLAMA_ARG_CTX_SIZE`
- `LLAMA_ARG_N_PARALLEL`
- `LLAMA_ARG_BATCH`
- `LLAMA_ARG_UBATCH`
- `LLAMA_ARG_N_GPU_LAYERS`
- `LLAMA_ARG_THREADS_HTTP`
- `LLAMA_ARG_CHAT_TEMPLATE`
- `LLAMA_ARG_N_PREDICT`
- `LLAMA_ARG_ENDPOINT_METRICS`
- `LLAMA_ARG_ENDPOINT_SLOTS`
- `LLAMA_ARG_EMBEDDINGS`
- `LLAMA_ARG_FLASH_ATTN`
- `LLAMA_ARG_DEFRAG_THOLD`

## Build

@@ -444,8 +333,6 @@ node index.js

`frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled.

`penalty_prompt`: This will replace the `prompt` for the purpose of the penalty evaluation. Can be either `null`, a string or an array of numbers representing tokens. Default: `null`, which is to use the original `prompt`.

`mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.

`mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`
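
For illustration only, these fields are plain JSON keys in the request body of the `/completion` endpoint; a sketch assuming a server already running on localhost:8080, with arbitrary values:

```sh
curl http://localhost:8080/completion \
    -H "Content-Type: application/json" \
    -d '{
        "prompt": "Say hello to llama.cpp",
        "n_predict": 64,
        "frequency_penalty": 0.2,
        "mirostat": 2,
        "mirostat_tau": 5.0
    }'
```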
@@ -519,9 +406,44 @@ Notice that each `probs` is an array of length `n_probs`.

*Options:*

`content`: (Required) The text to tokenize.

`add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`

`with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs. Default: `false`

**Response:**

Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter. The piece field is a string if the piece is valid unicode or a list of bytes otherwise.

If `with_pieces` is `false`:

```json
{
  "tokens": [123, 456, 789]
}
```

If `with_pieces` is `true`:

```json
{
  "tokens": [
    {"id": 123, "piece": "Hello"},
    {"id": 456, "piece": " world"},
    {"id": 789, "piece": "!"}
  ]
}
```

With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k:

```json
{
  "tokens": [
    {"id": 198, "piece": [195]}, // hex C3
    {"id": 164, "piece": [161]}  // hex A1
  ]
}
```
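
A round trip against this endpoint might look as follows (a sketch; the server address is assumed to be localhost:8080 and the returned IDs depend on the loaded model):

```sh
curl http://localhost:8080/tokenize \
    -H "Content-Type: application/json" \
    -d '{"content": "Hello world!", "with_pieces": true}'
```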
### POST `/detokenize`: Convert tokens to text

@@ -579,7 +501,7 @@ Given a ChatML-formatted json description in `messages`, it returns the predicted

See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.

The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name", "type": "string" }, "date": { "title": "Date", "type": "string" }, "participants": { "items": { "type": "string" }, "title": "Participants", "type": "string" } } } }`), similar to other OpenAI-inspired API providers.
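
For example, a schema-constrained request could be sketched like this (server address and schema are illustrative only):

```sh
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "messages": [{"role": "user", "content": "List three planets."}],
        "response_format": {"type": "json_object", "schema": {"type": "array", "items": {"type": "string"}}}
    }'
```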

*Examples:*

@@ -698,7 +620,6 @@ Example:

      "stopping_word": ""
    },
    "penalize_nl": true,
    "penalty_prompt_tokens": [],
    "presence_penalty": 0.0,
    "prompt": "Say hello to llama.cpp",
    "repeat_last_n": 64,
@@ -722,8 +643,7 @@ Example:

    "tfs_z": 1.0,
    "top_k": 40,
    "top_p": 0.949999988079071,
    "typical_p": 1.0,
    "use_penalty_prompt_tokens": false
  }
]
```

@@ -40,7 +40,6 @@ server --host localhost --port 8080 \

    --parallel 8 \
    --batch-size 512 \
    --ctx-size 4096 \
    --log-format text \
    -ngl 33
```

@@ -272,7 +272,6 @@ def start_server_background(args):

    server_args.append('--cont-batching')
    server_args.append('--metrics')
    server_args.append('--flash-attn')
    server_args.extend(['--log-format', "text"])
    args = [str(arg) for arg in [server_path, *server_args]]
    print(f"bench: starting server with: {' '.join(args)}")
    pkwargs = {

File diff suppressed because one or more lines are too long

examples/server/public/loading.html (new file, 12 lines)

@@ -0,0 +1,12 @@

<!DOCTYPE html>
<html>
<head>
    <meta http-equiv="refresh" content="5">
</head>
<body>
    <div id="loading">
        The model is loading. Please wait.<br/>
        The user interface will appear soon.
    </div>
</body>
</html>

File diff suppressed because it is too large

examples/server/tests/.gitignore (new file, 1 line, vendored)

@@ -0,0 +1 @@

.venv

@@ -40,7 +40,6 @@ It's possible to override some scenario steps values with environment variables:

| `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/llama-server` |
| `DEBUG` | "ON" to enable steps and server verbose mode `--verbose` |
| `SERVER_LOG_FORMAT_JSON` | if set switch server logs to json format |
| `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` |

### Run @bug, @wip or @wrong_usage annotated scenario

@@ -9,8 +9,11 @@ Feature: llama.cpp server

    And a model alias bert-bge-small
    And 42 as server seed
    And 2 slots
    # the bert-bge-small model has context size of 512
    # since the generated prompts are as big as the batch size, we need to set the batch size to 512
    # ref: https://huggingface.co/BAAI/bge-small-en-v1.5/blob/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/config.json#L20
    And 512 as batch size
    And 512 as ubatch size
    And 2048 KV cache size
    And embeddings extraction
    Then the server is starting

@@ -77,6 +77,35 @@ Feature: Parallel

      | disabled  | 128       |
      | enabled   | 64        |

  Scenario Outline: Multi users with number of prompts exceeding number of slots
    Given a system prompt You are a writer.
    And a model tinyllama-2
    Given a prompt:
      """
      Write a very long book.
      """
    And a prompt:
      """
      Write another a poem.
      """
    And a prompt:
      """
      What is LLM?
      """
    And a prompt:
      """
      The sky is blue and I love it.
      """
    And <n_predict> max tokens to predict
    And streaming is <streaming>
    Given concurrent OAI completions requests
    Then the server is busy
    Then the server is idle
    Then all prompts are predicted with <n_predict> tokens
    Examples:
      | streaming | n_predict |
      | disabled  | 128       |
      | enabled   | 64        |

  Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969
    Given a prompt:
Some files were not shown because too many files have changed in this diff.