mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-11-11 13:30:35 +00:00
Merge branch 'master' into compilade/convert-separate-extra-tensors
This commit is contained in:
commit
ed0f2c4ab1
19
.github/workflows/build.yml
vendored
19
.github/workflows/build.yml
vendored
@ -23,6 +23,9 @@ env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
|
||||
jobs:
|
||||
macOS-latest-cmake-arm64:
|
||||
@ -375,7 +378,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@ -401,7 +404,7 @@ jobs:
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: add oneAPI to apt
|
||||
shell: bash
|
||||
@ -442,7 +445,7 @@ jobs:
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: add oneAPI to apt
|
||||
shell: bash
|
||||
@ -546,7 +549,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@ -576,7 +579,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@ -610,7 +613,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@ -969,14 +972,14 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install
|
||||
id: depends
|
||||
run: |
|
||||
$ErrorActionPreference = "Stop"
|
||||
write-host "Downloading AMD HIP SDK Installer"
|
||||
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
|
||||
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
|
||||
write-host "Installing AMD HIP SDK"
|
||||
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
|
||||
write-host "Completed AMD HIP SDK installation"
|
||||
|
7
.github/workflows/server.yml
vendored
7
.github/workflows/server.yml
vendored
@ -20,6 +20,12 @@ on:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
|
||||
|
||||
env:
|
||||
LLAMA_LOG_COLORS: 1
|
||||
LLAMA_LOG_PREFIX: 1
|
||||
LLAMA_LOG_TIMESTAMPS: 1
|
||||
LLAMA_LOG_VERBOSITY: 10
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
@ -173,6 +179,7 @@ jobs:
|
||||
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
|
||||
run: |
|
||||
cd examples/server/tests
|
||||
$env:PYTHONIOENCODING = ":replace"
|
||||
behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
|
||||
|
||||
- name: Slow tests
|
||||
|
@ -82,11 +82,11 @@ set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
|
||||
|
||||
# change the default for these ggml options
|
||||
if (NOT DEFINED GGML_LLAMAFILE)
|
||||
set(GGML_LLAMAFILE ON)
|
||||
set(GGML_LLAMAFILE_DEFAULT ON)
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED GGML_CUDA_USE_GRAPHS)
|
||||
set(GGML_CUDA_USE_GRAPHS ON)
|
||||
if (NOT DEFINED GGML_CUDA_GRAPHS)
|
||||
set(GGML_CUDA_GRAPHS_DEFAULT ON)
|
||||
endif()
|
||||
|
||||
# transition helpers
|
||||
@ -139,10 +139,16 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location o
|
||||
# determining _precisely_ which defines are necessary for the llama-config
|
||||
# package.
|
||||
#
|
||||
set(GGML_TRANSIENT_DEFINES)
|
||||
get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
|
||||
get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
|
||||
if (GGML_DIR_DEFINES)
|
||||
list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES})
|
||||
endif()
|
||||
get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
|
||||
set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
|
||||
if (GGML_TARGET_DEFINES)
|
||||
list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
|
||||
endif()
|
||||
get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
|
||||
|
||||
set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
|
||||
|
57
Makefile
57
Makefile
@ -54,6 +54,7 @@ TEST_TARGETS = \
|
||||
tests/test-grammar-parser \
|
||||
tests/test-json-schema-to-grammar \
|
||||
tests/test-llama-grammar \
|
||||
tests/test-log \
|
||||
tests/test-model-load-cancel \
|
||||
tests/test-opt \
|
||||
tests/test-quantize-fns \
|
||||
@ -148,6 +149,14 @@ GGML_NO_METAL := 1
|
||||
DEPRECATE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_DISABLE_LOGS
|
||||
REMOVE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifdef LLAMA_SERVER_VERBOSE
|
||||
REMOVE_WARNING := 1
|
||||
endif
|
||||
|
||||
ifndef UNAME_S
|
||||
UNAME_S := $(shell uname -s)
|
||||
endif
|
||||
@ -351,19 +360,11 @@ ifdef LLAMA_SANITIZE_UNDEFINED
|
||||
MK_LDFLAGS += -fsanitize=undefined -g
|
||||
endif
|
||||
|
||||
ifdef LLAMA_SERVER_VERBOSE
|
||||
MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
|
||||
endif
|
||||
|
||||
ifdef LLAMA_SERVER_SSL
|
||||
MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
|
||||
MK_LDFLAGS += -lssl -lcrypto
|
||||
endif
|
||||
|
||||
ifdef LLAMA_DISABLE_LOGS
|
||||
MK_CPPFLAGS += -DLOG_DISABLE_LOGS
|
||||
endif # LLAMA_DISABLE_LOGS
|
||||
|
||||
# warnings
|
||||
WARN_FLAGS = \
|
||||
-Wall \
|
||||
@ -434,7 +435,7 @@ endif
|
||||
# TODO: probably these flags need to be tweaked on some architectures
|
||||
# feel free to update the Makefile for your architecture and send a pull request or issue
|
||||
|
||||
ifndef RISCV
|
||||
ifndef RISCV_CROSS_COMPILE
|
||||
|
||||
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
|
||||
# Use all CPU extensions that are available:
|
||||
@ -514,7 +515,12 @@ ifneq ($(filter loongarch64%,$(UNAME_M)),)
|
||||
MK_CXXFLAGS += -mlasx
|
||||
endif
|
||||
|
||||
else
|
||||
ifneq ($(filter riscv64%,$(UNAME_M)),)
|
||||
MK_CFLAGS += -march=rv64gcv -mabi=lp64d
|
||||
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
|
||||
endif
|
||||
|
||||
else # RISC-V CROSS COMPILATION
|
||||
MK_CFLAGS += -march=rv64gcv -mabi=lp64d
|
||||
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
|
||||
endif
|
||||
@ -613,7 +619,7 @@ ifdef GGML_CUDA
|
||||
CUDA_PATH ?= /usr/local/cuda
|
||||
endif
|
||||
|
||||
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
|
||||
MK_CPPFLAGS += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
|
||||
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
|
||||
MK_NVCCFLAGS += -use_fast_math
|
||||
endif # GGML_MUSA
|
||||
@ -925,6 +931,8 @@ OBJ_LLAMA = \
|
||||
|
||||
OBJ_COMMON = \
|
||||
common/common.o \
|
||||
common/arg.o \
|
||||
common/log.o \
|
||||
common/console.o \
|
||||
common/ngram-cache.o \
|
||||
common/sampling.o \
|
||||
@ -1021,6 +1029,14 @@ $(info - LLAMA_NO_CCACHE)
|
||||
$(info )
|
||||
endif
|
||||
|
||||
ifdef REMOVE_WARNING
|
||||
$(info !!! REMOVAL WARNING !!!)
|
||||
$(info The following LLAMA_ options have been removed and are no longer supported)
|
||||
$(info - LLAMA_DISABLE_LOGS (https://github.com/ggerganov/llama.cpp/pull/9418))
|
||||
$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418))
|
||||
$(info )
|
||||
endif
|
||||
|
||||
#
|
||||
# Build libraries
|
||||
#
|
||||
@ -1157,6 +1173,16 @@ common/common.o: \
|
||||
include/llama.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common/arg.o: \
|
||||
common/arg.cpp \
|
||||
common/arg.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common/log.o: \
|
||||
common/log.cpp \
|
||||
common/log.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common/sampling.o: \
|
||||
common/sampling.cpp \
|
||||
common/sampling.h \
|
||||
@ -1335,7 +1361,7 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
|
||||
$(OBJ_GGML) $(OBJ_LLAMA)
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
@ -1429,6 +1455,7 @@ llama-server: \
|
||||
examples/server/system-prompts.js.hpp \
|
||||
examples/server/prompt-formats.js.hpp \
|
||||
examples/server/json-schema-to-grammar.mjs.hpp \
|
||||
examples/server/loading.html.hpp \
|
||||
common/json.hpp \
|
||||
common/stb_image.h \
|
||||
$(OBJ_ALL)
|
||||
@ -1448,7 +1475,6 @@ llama-gen-docs: examples/gen-docs/gen-docs.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
./llama-gen-docs
|
||||
|
||||
libllava.a: examples/llava/llava.cpp \
|
||||
examples/llava/llava.h \
|
||||
@ -1517,6 +1543,11 @@ tests/test-llama-grammar: tests/test-llama-grammar.cpp \
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-log: tests/test-log.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-grammar-parser: tests/test-grammar-parser.cpp \
|
||||
$(OBJ_ALL)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||
|
@ -17,7 +17,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
||||
|
||||
## Hot topics
|
||||
|
||||
- *add hot topics here*
|
||||
- Huggingface GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
|
||||
|
||||
----
|
||||
|
||||
@ -77,6 +77,7 @@ Typically finetunes of the base models below are supported as well.
|
||||
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
|
||||
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
|
||||
- [x] [OLMo](https://allenai.org/olmo)
|
||||
- [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924)
|
||||
- [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
|
||||
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
|
||||
- [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520)
|
||||
@ -89,6 +90,7 @@ Typically finetunes of the base models below are supported as well.
|
||||
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
|
||||
- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
|
||||
- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
|
||||
- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
|
||||
|
||||
(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
|
||||
|
||||
@ -163,6 +165,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
||||
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
|
||||
- [AIKit](https://github.com/sozercan/aikit) (MIT)
|
||||
- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
|
||||
- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
|
||||
|
||||
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
|
||||
|
||||
@ -171,6 +174,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
||||
- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
|
||||
- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
|
||||
- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
|
||||
- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example)
|
||||
|
||||
**Infrastructure:**
|
||||
|
||||
|
@ -737,6 +737,9 @@ function gg_sum_embd_bge_small {
|
||||
|
||||
## main
|
||||
|
||||
export LLAMA_LOG_PREFIX=1
|
||||
export LLAMA_LOG_TIMESTAMPS=1
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
|
||||
rm -rf ${SRC}/models-mnt
|
||||
|
@ -51,19 +51,23 @@ endif()
|
||||
set(TARGET common)
|
||||
|
||||
add_library(${TARGET} STATIC
|
||||
arg.cpp
|
||||
arg.h
|
||||
base64.hpp
|
||||
common.h
|
||||
common.cpp
|
||||
sampling.h
|
||||
sampling.cpp
|
||||
console.h
|
||||
common.h
|
||||
console.cpp
|
||||
json.hpp
|
||||
console.h
|
||||
json-schema-to-grammar.cpp
|
||||
train.h
|
||||
train.cpp
|
||||
ngram-cache.h
|
||||
json.hpp
|
||||
log.cpp
|
||||
log.h
|
||||
ngram-cache.cpp
|
||||
ngram-cache.h
|
||||
sampling.cpp
|
||||
sampling.h
|
||||
train.cpp
|
||||
train.h
|
||||
)
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
|
1994
common/arg.cpp
Normal file
1994
common/arg.cpp
Normal file
File diff suppressed because it is too large
Load Diff
77
common/arg.h
Normal file
77
common/arg.h
Normal file
@ -0,0 +1,77 @@
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
//
|
||||
// CLI argument parsing
|
||||
//
|
||||
|
||||
struct llama_arg {
|
||||
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
|
||||
std::vector<const char *> args;
|
||||
const char * value_hint = nullptr; // help text or example for arg value
|
||||
const char * value_hint_2 = nullptr; // for second arg value
|
||||
const char * env = nullptr;
|
||||
std::string help;
|
||||
bool is_sparam = false; // is current arg a sampling param?
|
||||
void (*handler_void) (gpt_params & params) = nullptr;
|
||||
void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
|
||||
void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
|
||||
void (*handler_int) (gpt_params & params, int) = nullptr;
|
||||
|
||||
llama_arg(
|
||||
const std::initializer_list<const char *> & args,
|
||||
const char * value_hint,
|
||||
const std::string & help,
|
||||
void (*handler)(gpt_params & params, const std::string &)
|
||||
) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
|
||||
|
||||
llama_arg(
|
||||
const std::initializer_list<const char *> & args,
|
||||
const char * value_hint,
|
||||
const std::string & help,
|
||||
void (*handler)(gpt_params & params, int)
|
||||
) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
|
||||
|
||||
llama_arg(
|
||||
const std::initializer_list<const char *> & args,
|
||||
const std::string & help,
|
||||
void (*handler)(gpt_params & params)
|
||||
) : args(args), help(help), handler_void(handler) {}
|
||||
|
||||
// support 2 values for arg
|
||||
llama_arg(
|
||||
const std::initializer_list<const char *> & args,
|
||||
const char * value_hint,
|
||||
const char * value_hint_2,
|
||||
const std::string & help,
|
||||
void (*handler)(gpt_params & params, const std::string &, const std::string &)
|
||||
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
|
||||
|
||||
llama_arg & set_examples(std::initializer_list<enum llama_example> examples);
|
||||
llama_arg & set_env(const char * env);
|
||||
llama_arg & set_sparam();
|
||||
bool in_example(enum llama_example ex);
|
||||
bool get_value_from_env(std::string & output);
|
||||
bool has_value_from_env();
|
||||
std::string to_string();
|
||||
};
|
||||
|
||||
struct gpt_params_context {
|
||||
enum llama_example ex = LLAMA_EXAMPLE_COMMON;
|
||||
gpt_params & params;
|
||||
std::vector<llama_arg> options;
|
||||
void(*print_usage)(int, char **) = nullptr;
|
||||
gpt_params_context(gpt_params & params) : params(params) {}
|
||||
};
|
||||
|
||||
// parse input arguments from CLI
|
||||
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
|
||||
bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
||||
|
||||
// function to be used by test-arg-parser
|
||||
gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
2169
common/common.cpp
2169
common/common.cpp
File diff suppressed because it is too large
Load Diff
228
common/common.h
228
common/common.h
@ -4,20 +4,9 @@
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
#include "sampling.h"
|
||||
|
||||
#define LOG_NO_FILE_LINE_FUNCTION
|
||||
#include "log.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <random>
|
||||
#include <thread>
|
||||
#include <set>
|
||||
#include <unordered_map>
|
||||
#include <tuple>
|
||||
#include <functional>
|
||||
#include <sstream>
|
||||
|
||||
#ifdef _WIN32
|
||||
#define DIRECTORY_SEPARATOR '\\'
|
||||
@ -56,11 +45,20 @@ struct llama_control_vector_load_info;
|
||||
// CPU utils
|
||||
//
|
||||
|
||||
struct cpu_params {
|
||||
int n_threads = -1;
|
||||
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
|
||||
bool mask_valid = false; // Default: any CPU
|
||||
enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
|
||||
bool strict_cpu = false; // Use strict CPU placement
|
||||
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
|
||||
};
|
||||
|
||||
int32_t cpu_get_num_physical_cores();
|
||||
int32_t cpu_get_num_math();
|
||||
|
||||
//
|
||||
// CLI argument parsing
|
||||
// Common params
|
||||
//
|
||||
|
||||
enum llama_example {
|
||||
@ -78,28 +76,72 @@ enum llama_example {
|
||||
LLAMA_EXAMPLE_CVECTOR_GENERATOR,
|
||||
LLAMA_EXAMPLE_EXPORT_LORA,
|
||||
LLAMA_EXAMPLE_LLAVA,
|
||||
LLAMA_EXAMPLE_LOOKUP,
|
||||
LLAMA_EXAMPLE_PARALLEL,
|
||||
|
||||
LLAMA_EXAMPLE_COUNT,
|
||||
};
|
||||
|
||||
enum gpt_sampler_type {
|
||||
GPT_SAMPLER_TYPE_NONE = 0,
|
||||
GPT_SAMPLER_TYPE_TOP_K = 1,
|
||||
GPT_SAMPLER_TYPE_TOP_P = 2,
|
||||
GPT_SAMPLER_TYPE_MIN_P = 3,
|
||||
GPT_SAMPLER_TYPE_TFS_Z = 4,
|
||||
GPT_SAMPLER_TYPE_TYPICAL_P = 5,
|
||||
GPT_SAMPLER_TYPE_TEMPERATURE = 6,
|
||||
};
|
||||
|
||||
// dimensionality reduction methods, used by cvector-generator
|
||||
enum dimre_method {
|
||||
DIMRE_METHOD_PCA,
|
||||
DIMRE_METHOD_MEAN,
|
||||
};
|
||||
|
||||
struct cpu_params {
|
||||
int n_threads = -1;
|
||||
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
|
||||
bool mask_valid = false; // Default: any CPU
|
||||
enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
|
||||
bool strict_cpu = false; // Use strict CPU placement
|
||||
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
|
||||
// sampler parameters
|
||||
struct gpt_sampler_params {
|
||||
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
||||
|
||||
int32_t n_prev = 64; // number of previous tokens to remember
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
|
||||
int32_t top_k = 40; // <= 0 to use vocab size
|
||||
float top_p = 0.95f; // 1.0 = disabled
|
||||
float min_p = 0.05f; // 0.0 = disabled
|
||||
float tfs_z = 1.00f; // 1.0 = disabled
|
||||
float typ_p = 1.00f; // typical_p, 1.0 = disabled
|
||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||
float dynatemp_range = 0.00f; // 0.0 = disabled
|
||||
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float penalty_repeat = 1.00f; // 1.0 = disabled
|
||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||
float penalty_present = 0.00f; // 0.0 = disabled
|
||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||
float mirostat_tau = 5.00f; // target entropy
|
||||
float mirostat_eta = 0.10f; // learning rate
|
||||
bool penalize_nl = false; // consider newlines as a repeatable token
|
||||
bool ignore_eos = false;
|
||||
bool no_perf = false; // disable performance metrics
|
||||
|
||||
std::vector<enum gpt_sampler_type> samplers = {
|
||||
GPT_SAMPLER_TYPE_TOP_K,
|
||||
GPT_SAMPLER_TYPE_TFS_Z,
|
||||
GPT_SAMPLER_TYPE_TYPICAL_P,
|
||||
GPT_SAMPLER_TYPE_TOP_P,
|
||||
GPT_SAMPLER_TYPE_MIN_P,
|
||||
GPT_SAMPLER_TYPE_TEMPERATURE
|
||||
};
|
||||
|
||||
std::string grammar; // optional BNF-like grammar to constrain sampling
|
||||
|
||||
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
|
||||
|
||||
// print the parameters into a string
|
||||
std::string print() const;
|
||||
};
|
||||
|
||||
struct gpt_params {
|
||||
enum llama_example curr_ex = LLAMA_EXAMPLE_COMMON;
|
||||
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_ctx = 0; // context size
|
||||
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
||||
@ -143,23 +185,23 @@ struct gpt_params {
|
||||
|
||||
struct gpt_sampler_params sparams;
|
||||
|
||||
std::string model = ""; // model path
|
||||
std::string model_draft = ""; // draft model for speculative decoding
|
||||
std::string model_alias = "unknown"; // model alias
|
||||
std::string model_url = ""; // model url to download
|
||||
std::string hf_token = ""; // HF token
|
||||
std::string hf_repo = ""; // HF repo
|
||||
std::string hf_file = ""; // HF file
|
||||
std::string prompt = "";
|
||||
std::string prompt_file = ""; // store the external prompt file name
|
||||
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
|
||||
std::string input_prefix = ""; // string to prefix user inputs with
|
||||
std::string input_suffix = ""; // string to suffix user inputs with
|
||||
std::string logdir = ""; // directory in which to save YAML log files
|
||||
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
|
||||
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
|
||||
std::string logits_file = ""; // file for saving *all* logits
|
||||
std::string rpc_servers = ""; // comma separated list of RPC servers
|
||||
std::string model = ""; // model path // NOLINT
|
||||
std::string model_draft = ""; // draft model for speculative decoding // NOLINT
|
||||
std::string model_alias = "unknown"; // model alias // NOLINT
|
||||
std::string model_url = ""; // model url to download // NOLINT
|
||||
std::string hf_token = ""; // HF token // NOLINT
|
||||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
std::string prompt = ""; // NOLINT
|
||||
std::string prompt_file = ""; // store the external prompt file name // NOLINT
|
||||
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
|
||||
std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
|
||||
std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
|
||||
std::string logdir = ""; // directory in which to save YAML log files // NOLINT
|
||||
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
|
||||
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
|
||||
std::string logits_file = ""; // file for saving *all* logits // NOLINT
|
||||
std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
|
||||
|
||||
std::vector<std::string> in_files; // all input files
|
||||
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
|
||||
@ -189,7 +231,6 @@ struct gpt_params {
|
||||
|
||||
bool kl_divergence = false; // compute KL divergence
|
||||
|
||||
std::function<void(int, char **)> print_usage = nullptr; // print example-specific usage and example
|
||||
bool usage = false; // print usage
|
||||
bool use_color = false; // use color to distinguish generations and inputs
|
||||
bool special = false; // enable special token output
|
||||
@ -204,6 +245,8 @@ struct gpt_params {
|
||||
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
||||
bool cont_batching = true; // insert new sequences for decoding on-the-fly
|
||||
bool flash_attn = false; // flash attention
|
||||
bool no_perf = false; // disable performance metrics
|
||||
bool ctx_shift = true; // context shift on inifinite text generation
|
||||
|
||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||
bool logits_all = false; // return logits for all tokens in the batch
|
||||
@ -220,7 +263,7 @@ struct gpt_params {
|
||||
std::string cache_type_v = "f16"; // KV cache data type for the V
|
||||
|
||||
// multimodal models (see examples/llava)
|
||||
std::string mmproj = ""; // path to multimodal projector
|
||||
std::string mmproj = ""; // path to multimodal projector // NOLINT
|
||||
std::vector<std::string> image; // path to image file(s)
|
||||
|
||||
// embedding
|
||||
@ -236,15 +279,15 @@ struct gpt_params {
|
||||
int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||
|
||||
std::string hostname = "127.0.0.1";
|
||||
std::string public_path = "";
|
||||
std::string chat_template = "";
|
||||
std::string system_prompt = "";
|
||||
std::string public_path = ""; // NOLINT
|
||||
std::string chat_template = ""; // NOLINT
|
||||
std::string system_prompt = ""; // NOLINT
|
||||
bool enable_chat_template = true;
|
||||
|
||||
std::vector<std::string> api_keys;
|
||||
|
||||
std::string ssl_file_key = "";
|
||||
std::string ssl_file_cert = "";
|
||||
std::string ssl_file_key = ""; // NOLINT
|
||||
std::string ssl_file_cert = ""; // NOLINT
|
||||
|
||||
bool endpoint_slots = true;
|
||||
bool endpoint_metrics = false;
|
||||
@ -299,91 +342,9 @@ struct gpt_params {
|
||||
bool batched_bench_output_jsonl = false;
|
||||
};
|
||||
|
||||
struct llama_arg {
|
||||
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
|
||||
std::vector<const char *> args;
|
||||
const char * value_hint = nullptr; // help text or example for arg value
|
||||
const char * value_hint_2 = nullptr; // for second arg value
|
||||
const char * env = nullptr;
|
||||
std::string help;
|
||||
void (*handler_void) (gpt_params & params) = nullptr;
|
||||
void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
|
||||
void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
|
||||
void (*handler_int) (gpt_params & params, int) = nullptr;
|
||||
|
||||
llama_arg(
|
||||
const std::initializer_list<const char *> & args,
|
||||
const char * value_hint,
|
||||
const std::string & help,
|
||||
void (*handler)(gpt_params & params, const std::string &)
|
||||
) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
|
||||
|
||||
llama_arg(
|
||||
const std::initializer_list<const char *> & args,
|
||||
const char * value_hint,
|
||||
const std::string & help,
|
||||
void (*handler)(gpt_params & params, int)
|
||||
) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
|
||||
|
||||
llama_arg(
|
||||
const std::initializer_list<const char *> & args,
|
||||
const std::string & help,
|
||||
void (*handler)(gpt_params & params)
|
||||
) : args(args), help(help), handler_void(handler) {}
|
||||
|
||||
// support 2 values for arg
|
||||
llama_arg(
|
||||
const std::initializer_list<const char *> & args,
|
||||
const char * value_hint,
|
||||
const char * value_hint_2,
|
||||
const std::string & help,
|
||||
void (*handler)(gpt_params & params, const std::string &, const std::string &)
|
||||
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
|
||||
|
||||
llama_arg & set_examples(std::initializer_list<enum llama_example> examples) {
|
||||
this->examples = std::move(examples);
|
||||
return *this;
|
||||
}
|
||||
|
||||
llama_arg & set_env(const char * env) {
|
||||
help = help + "\n(env: " + env + ")";
|
||||
this->env = env;
|
||||
return *this;
|
||||
}
|
||||
|
||||
bool in_example(enum llama_example ex) {
|
||||
return examples.find(ex) != examples.end();
|
||||
}
|
||||
|
||||
bool get_value_from_env(std::string & output) const {
|
||||
if (env == nullptr) return false;
|
||||
char * value = std::getenv(env);
|
||||
if (value) {
|
||||
output = value;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool has_value_from_env() const {
|
||||
return env != nullptr && std::getenv(env);
|
||||
}
|
||||
|
||||
std::string to_string();
|
||||
};
|
||||
|
||||
// initialize list of options (arguments) that can be used by the current example
|
||||
std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex);
|
||||
// optionally, we can provide "print_usage" to print example usage
|
||||
std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex, std::function<void(int, char **)> print_usage);
|
||||
|
||||
// parse input arguments from CLI
|
||||
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
|
||||
bool gpt_params_parse (int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
|
||||
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
|
||||
|
||||
// print full usage message; it will be called internally by gpt_params_parse() if "-h" is set
|
||||
void gpt_params_print_usage(gpt_params & params, std::vector<llama_arg> & options);
|
||||
// call once at the start of a program if it uses libcommon
|
||||
// initializes the logging system and prints info about the build
|
||||
void gpt_init();
|
||||
|
||||
std::string gpt_params_get_system_info(const gpt_params & params);
|
||||
|
||||
@ -420,6 +381,11 @@ static std::vector<T> string_split(const std::string & str, char delim) {
|
||||
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
||||
void string_process_escapes(std::string & input);
|
||||
|
||||
std::string string_from(bool value);
|
||||
std::string string_from(const std::vector<int> & values);
|
||||
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
|
||||
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
|
||||
|
||||
//
|
||||
// Filesystem utils
|
||||
//
|
||||
|
401
common/log.cpp
Normal file
401
common/log.cpp
Normal file
@ -0,0 +1,401 @@
|
||||
#include "log.h"
|
||||
|
||||
#include <condition_variable>
|
||||
#include <cstdarg>
|
||||
#include <cstdio>
|
||||
#include <mutex>
|
||||
#include <sstream>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;
|
||||
|
||||
void gpt_log_set_verbosity_thold(int verbosity) {
|
||||
gpt_log_verbosity_thold = verbosity;
|
||||
}
|
||||
|
||||
#define LOG_COL_DEFAULT "\033[0m"
|
||||
#define LOG_COL_BOLD "\033[1m"
|
||||
#define LOG_COL_RED "\033[31m"
|
||||
#define LOG_COL_GREEN "\033[32m"
|
||||
#define LOG_COL_YELLOW "\033[33m"
|
||||
#define LOG_COL_BLUE "\033[34m"
|
||||
#define LOG_COL_MAGENTA "\033[35m"
|
||||
#define LOG_COL_CYAN "\033[36m"
|
||||
#define LOG_COL_WHITE "\033[37m"
|
||||
|
||||
static int64_t t_us() {
|
||||
return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
|
||||
}
|
||||
|
||||
// colors
|
||||
enum gpt_log_col : int {
|
||||
GPT_LOG_COL_DEFAULT = 0,
|
||||
GPT_LOG_COL_BOLD,
|
||||
GPT_LOG_COL_RED,
|
||||
GPT_LOG_COL_GREEN,
|
||||
GPT_LOG_COL_YELLOW,
|
||||
GPT_LOG_COL_BLUE,
|
||||
GPT_LOG_COL_MAGENTA,
|
||||
GPT_LOG_COL_CYAN,
|
||||
GPT_LOG_COL_WHITE,
|
||||
};
|
||||
|
||||
// disable colors by default
|
||||
static std::vector<const char *> g_col = {
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
};
|
||||
|
||||
struct gpt_log_entry {
|
||||
enum ggml_log_level level;
|
||||
|
||||
bool prefix;
|
||||
|
||||
int64_t timestamp;
|
||||
|
||||
std::vector<char> msg;
|
||||
|
||||
// signals the worker thread to stop
|
||||
bool is_end;
|
||||
|
||||
void print(FILE * file = nullptr) const {
|
||||
FILE * fcur = file;
|
||||
if (!fcur) {
|
||||
// stderr displays DBG messages only when their verbosity level is not higher than the threshold
|
||||
// these messages will still be logged to a file
|
||||
if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
|
||||
return;
|
||||
}
|
||||
|
||||
fcur = stdout;
|
||||
|
||||
if (level != GGML_LOG_LEVEL_NONE) {
|
||||
fcur = stderr;
|
||||
}
|
||||
}
|
||||
|
||||
if (level != GGML_LOG_LEVEL_NONE && prefix) {
|
||||
if (timestamp) {
|
||||
// [M.s.ms.us]
|
||||
fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
|
||||
g_col[GPT_LOG_COL_BLUE],
|
||||
(int) (timestamp / 1000000 / 60),
|
||||
(int) (timestamp / 1000000 % 60),
|
||||
(int) (timestamp / 1000 % 1000),
|
||||
(int) (timestamp % 1000),
|
||||
g_col[GPT_LOG_COL_DEFAULT]);
|
||||
}
|
||||
|
||||
switch (level) {
|
||||
case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN], g_col[GPT_LOG_COL_DEFAULT]); break;
|
||||
case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], "" ); break;
|
||||
case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED], "" ); break;
|
||||
case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW], "" ); break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(fcur, "%s", msg.data());
|
||||
|
||||
if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) {
|
||||
fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
|
||||
}
|
||||
|
||||
fflush(fcur);
|
||||
}
|
||||
};
|
||||
|
||||
struct gpt_log {
|
||||
// default capacity - will be expanded if needed
|
||||
gpt_log() : gpt_log(256) {}
|
||||
|
||||
gpt_log(size_t capacity) {
|
||||
file = nullptr;
|
||||
prefix = false;
|
||||
timestamps = false;
|
||||
running = false;
|
||||
t_start = t_us();
|
||||
|
||||
// initial message size - will be expanded if longer messages arrive
|
||||
entries.resize(capacity);
|
||||
for (auto & entry : entries) {
|
||||
entry.msg.resize(256);
|
||||
}
|
||||
|
||||
head = 0;
|
||||
tail = 0;
|
||||
|
||||
resume();
|
||||
}
|
||||
|
||||
~gpt_log() {
|
||||
pause();
|
||||
if (file) {
|
||||
fclose(file);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
std::mutex mtx;
|
||||
std::thread thrd;
|
||||
std::condition_variable cv;
|
||||
|
||||
FILE * file;
|
||||
|
||||
bool prefix;
|
||||
bool timestamps;
|
||||
bool running;
|
||||
|
||||
int64_t t_start;
|
||||
|
||||
// ring buffer of entries
|
||||
std::vector<gpt_log_entry> entries;
|
||||
size_t head;
|
||||
size_t tail;
|
||||
|
||||
// worker thread copies into this
|
||||
gpt_log_entry cur;
|
||||
|
||||
public:
|
||||
void add(enum ggml_log_level level, const char * fmt, va_list args) {
|
||||
std::lock_guard<std::mutex> lock(mtx);
|
||||
|
||||
if (!running) {
|
||||
// discard messages while the worker thread is paused
|
||||
return;
|
||||
}
|
||||
|
||||
auto & entry = entries[tail];
|
||||
|
||||
{
|
||||
// cannot use args twice, so make a copy in case we need to expand the buffer
|
||||
va_list args_copy;
|
||||
va_copy(args_copy, args);
|
||||
|
||||
#if 1
|
||||
const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args);
|
||||
if (n >= entry.msg.size()) {
|
||||
entry.msg.resize(n + 1);
|
||||
vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy);
|
||||
}
|
||||
#else
|
||||
// hack for bolding arguments
|
||||
|
||||
std::stringstream ss;
|
||||
for (int i = 0; fmt[i] != 0; i++) {
|
||||
if (fmt[i] == '%') {
|
||||
ss << LOG_COL_BOLD;
|
||||
while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++];
|
||||
ss << LOG_COL_DEFAULT;
|
||||
if (fmt[i] == 0) break;
|
||||
}
|
||||
ss << fmt[i];
|
||||
}
|
||||
const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args);
|
||||
if (n >= entry.msg.size()) {
|
||||
entry.msg.resize(n + 1);
|
||||
vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
entry.level = level;
|
||||
entry.prefix = prefix;
|
||||
entry.timestamp = 0;
|
||||
if (timestamps) {
|
||||
entry.timestamp = t_us() - t_start;
|
||||
}
|
||||
entry.is_end = false;
|
||||
|
||||
tail = (tail + 1) % entries.size();
|
||||
if (tail == head) {
|
||||
// expand the buffer
|
||||
std::vector<gpt_log_entry> new_entries(2*entries.size());
|
||||
|
||||
size_t new_tail = 0;
|
||||
|
||||
do {
|
||||
new_entries[new_tail] = std::move(entries[head]);
|
||||
|
||||
head = (head + 1) % entries.size();
|
||||
new_tail = (new_tail + 1);
|
||||
} while (head != tail);
|
||||
|
||||
head = 0;
|
||||
tail = new_tail;
|
||||
|
||||
for (size_t i = tail; i < new_entries.size(); i++) {
|
||||
new_entries[i].msg.resize(256);
|
||||
}
|
||||
|
||||
entries = std::move(new_entries);
|
||||
}
|
||||
|
||||
cv.notify_one();
|
||||
}
|
||||
|
||||
void resume() {
|
||||
std::lock_guard<std::mutex> lock(mtx);
|
||||
|
||||
if (running) {
|
||||
return;
|
||||
}
|
||||
|
||||
running = true;
|
||||
|
||||
thrd = std::thread([this]() {
|
||||
while (true) {
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(mtx);
|
||||
cv.wait(lock, [this]() { return head != tail; });
|
||||
|
||||
cur = entries[head];
|
||||
|
||||
head = (head + 1) % entries.size();
|
||||
}
|
||||
|
||||
if (cur.is_end) {
|
||||
break;
|
||||
}
|
||||
|
||||
cur.print(); // stdout and stderr
|
||||
|
||||
if (file) {
|
||||
cur.print(file);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
void pause() {
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mtx);
|
||||
|
||||
if (!running) {
|
||||
return;
|
||||
}
|
||||
|
||||
running = false;
|
||||
|
||||
// push an entry to signal the worker thread to stop
|
||||
{
|
||||
auto & entry = entries[tail];
|
||||
entry.is_end = true;
|
||||
|
||||
tail = (tail + 1) % entries.size();
|
||||
}
|
||||
|
||||
cv.notify_one();
|
||||
}
|
||||
|
||||
thrd.join();
|
||||
}
|
||||
|
||||
void set_file(const char * path) {
|
||||
pause();
|
||||
|
||||
if (file) {
|
||||
fclose(file);
|
||||
}
|
||||
|
||||
if (path) {
|
||||
file = fopen(path, "w");
|
||||
} else {
|
||||
file = nullptr;
|
||||
}
|
||||
|
||||
resume();
|
||||
}
|
||||
|
||||
void set_colors(bool colors) {
|
||||
pause();
|
||||
|
||||
if (colors) {
|
||||
g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
|
||||
g_col[GPT_LOG_COL_BOLD] = LOG_COL_BOLD;
|
||||
g_col[GPT_LOG_COL_RED] = LOG_COL_RED;
|
||||
g_col[GPT_LOG_COL_GREEN] = LOG_COL_GREEN;
|
||||
g_col[GPT_LOG_COL_YELLOW] = LOG_COL_YELLOW;
|
||||
g_col[GPT_LOG_COL_BLUE] = LOG_COL_BLUE;
|
||||
g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
|
||||
g_col[GPT_LOG_COL_CYAN] = LOG_COL_CYAN;
|
||||
g_col[GPT_LOG_COL_WHITE] = LOG_COL_WHITE;
|
||||
} else {
|
||||
for (size_t i = 0; i < g_col.size(); i++) {
|
||||
g_col[i] = "";
|
||||
}
|
||||
}
|
||||
|
||||
resume();
|
||||
}
|
||||
|
||||
void set_prefix(bool prefix) {
|
||||
std::lock_guard<std::mutex> lock(mtx);
|
||||
|
||||
this->prefix = prefix;
|
||||
}
|
||||
|
||||
void set_timestamps(bool timestamps) {
|
||||
std::lock_guard<std::mutex> lock(mtx);
|
||||
|
||||
this->timestamps = timestamps;
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// public API
|
||||
//
|
||||
|
||||
struct gpt_log * gpt_log_init() {
|
||||
return new gpt_log;
|
||||
}
|
||||
|
||||
struct gpt_log * gpt_log_main() {
|
||||
static struct gpt_log log;
|
||||
|
||||
return &log;
|
||||
}
|
||||
|
||||
void gpt_log_pause(struct gpt_log * log) {
|
||||
log->pause();
|
||||
}
|
||||
|
||||
void gpt_log_resume(struct gpt_log * log) {
|
||||
log->resume();
|
||||
}
|
||||
|
||||
void gpt_log_free(struct gpt_log * log) {
|
||||
delete log;
|
||||
}
|
||||
|
||||
void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) {
|
||||
va_list args;
|
||||
va_start(args, fmt);
|
||||
log->add(level, fmt, args);
|
||||
va_end(args);
|
||||
}
|
||||
|
||||
void gpt_log_set_file(struct gpt_log * log, const char * file) {
|
||||
log->set_file(file);
|
||||
}
|
||||
|
||||
void gpt_log_set_colors(struct gpt_log * log, bool colors) {
|
||||
log->set_colors(colors);
|
||||
}
|
||||
|
||||
void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
|
||||
log->set_prefix(prefix);
|
||||
}
|
||||
|
||||
void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
|
||||
log->set_timestamps(timestamps);
|
||||
}
|
782
common/log.h
782
common/log.h
@ -1,724 +1,90 @@
|
||||
#pragma once
|
||||
|
||||
#include <chrono>
|
||||
#include <cstring>
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <cinttypes>
|
||||
#include "ggml.h" // for ggml_log_level
|
||||
|
||||
// --------------------------------
|
||||
//
|
||||
// Basic usage:
|
||||
//
|
||||
// --------
|
||||
//
|
||||
// The LOG() and LOG_TEE() macros are ready to go by default
|
||||
// they do not require any initialization.
|
||||
//
|
||||
// LOGLN() and LOG_TEELN() are variants which automatically
|
||||
// include \n character at the end of the log string.
|
||||
//
|
||||
// LOG() behaves exactly like printf, by default writing to a logfile.
|
||||
// LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ).
|
||||
//
|
||||
// Default logfile is named
|
||||
// "llama.<threadID>.log"
|
||||
// Default LOG_TEE() secondary output target is
|
||||
// stderr
|
||||
//
|
||||
// Logs can be dynamically disabled or enabled using functions:
|
||||
// log_disable()
|
||||
// and
|
||||
// log_enable()
|
||||
//
|
||||
// A log target can be changed with:
|
||||
// log_set_target( string )
|
||||
// creating and opening, or re-opening a file by string filename
|
||||
// or
|
||||
// log_set_target( FILE* )
|
||||
// allowing to point at stderr, stdout, or any valid FILE* file handler.
|
||||
//
|
||||
// --------
|
||||
//
|
||||
// End of Basic usage.
|
||||
//
|
||||
// --------------------------------
|
||||
|
||||
// Specifies a log target.
|
||||
// default uses log_handler() with "llama.log" log file
|
||||
// this can be changed, by defining LOG_TARGET
|
||||
// like so:
|
||||
//
|
||||
// #define LOG_TARGET (a valid FILE*)
|
||||
// #include "log.h"
|
||||
//
|
||||
// or it can be simply redirected to stdout or stderr
|
||||
// like so:
|
||||
//
|
||||
// #define LOG_TARGET stderr
|
||||
// #include "log.h"
|
||||
//
|
||||
// The log target can also be redirected to a different function
|
||||
// like so:
|
||||
//
|
||||
// #define LOG_TARGET log_handler_different()
|
||||
// #include "log.h"
|
||||
//
|
||||
// FILE* log_handler_different()
|
||||
// {
|
||||
// return stderr;
|
||||
// }
|
||||
//
|
||||
// or:
|
||||
//
|
||||
// #define LOG_TARGET log_handler_another_one("somelog.log")
|
||||
// #include "log.h"
|
||||
//
|
||||
// FILE* log_handler_another_one(char*filename)
|
||||
// {
|
||||
// static FILE* logfile = nullptr;
|
||||
// (...)
|
||||
// if( !logfile )
|
||||
// {
|
||||
// fopen(...)
|
||||
// }
|
||||
// (...)
|
||||
// return logfile
|
||||
// }
|
||||
//
|
||||
#ifndef LOG_TARGET
|
||||
#define LOG_TARGET log_handler()
|
||||
#endif
|
||||
|
||||
#ifndef LOG_TEE_TARGET
|
||||
#define LOG_TEE_TARGET stderr
|
||||
#endif
|
||||
|
||||
// Utility for synchronizing log configuration state
|
||||
// since std::optional was introduced only in c++17
|
||||
enum LogTriState
|
||||
{
|
||||
LogTriStateSame,
|
||||
LogTriStateFalse,
|
||||
LogTriStateTrue
|
||||
};
|
||||
|
||||
// Utility to obtain "pid" like unique process id and use it when creating log files.
|
||||
inline std::string log_get_pid()
|
||||
{
|
||||
static std::string pid;
|
||||
if (pid.empty())
|
||||
{
|
||||
// std::this_thread::get_id() is the most portable way of obtaining a "process id"
|
||||
// it's not the same as "pid" but is unique enough to solve multiple instances
|
||||
// trying to write to the same log.
|
||||
std::stringstream ss;
|
||||
ss << std::this_thread::get_id();
|
||||
pid = ss.str();
|
||||
}
|
||||
|
||||
return pid;
|
||||
}
|
||||
|
||||
// Utility function for generating log file names with unique id based on thread id.
|
||||
// invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
|
||||
// where the number is a runtime id of the current thread.
|
||||
|
||||
#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension)
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension)
|
||||
{
|
||||
static bool _multilog = false;
|
||||
|
||||
if (multilog != LogTriStateSame)
|
||||
{
|
||||
_multilog = multilog == LogTriStateTrue;
|
||||
}
|
||||
|
||||
std::stringstream buf;
|
||||
|
||||
buf << log_file_basename;
|
||||
if (_multilog)
|
||||
{
|
||||
buf << ".";
|
||||
buf << log_get_pid();
|
||||
}
|
||||
buf << ".";
|
||||
buf << log_file_extension;
|
||||
|
||||
return buf.str();
|
||||
}
|
||||
|
||||
#ifndef LOG_DEFAULT_FILE_NAME
|
||||
#define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log")
|
||||
#endif
|
||||
|
||||
// Utility for turning #define values into string literals
|
||||
// so we can have a define for stderr and
|
||||
// we can print "stderr" instead of literal stderr, etc.
|
||||
#define LOG_STRINGIZE1(s) #s
|
||||
#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s)
|
||||
|
||||
#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET)
|
||||
|
||||
// Allows disabling timestamps.
|
||||
// in order to disable, define LOG_NO_TIMESTAMPS
|
||||
// like so:
|
||||
//
|
||||
// #define LOG_NO_TIMESTAMPS
|
||||
// #include "log.h"
|
||||
//
|
||||
#ifndef LOG_NO_TIMESTAMPS
|
||||
#ifndef _MSC_VER
|
||||
#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
|
||||
#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
|
||||
#ifndef __GNUC__
|
||||
# define LOG_ATTRIBUTE_FORMAT(...)
|
||||
#elif defined(__MINGW32__)
|
||||
# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
||||
#else
|
||||
#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
|
||||
#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
|
||||
#endif
|
||||
#else
|
||||
#define LOG_TIMESTAMP_FMT "%s"
|
||||
#define LOG_TIMESTAMP_VAL ,""
|
||||
# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||
#endif
|
||||
|
||||
#ifdef LOG_TEE_TIMESTAMPS
|
||||
#ifndef _MSC_VER
|
||||
#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
|
||||
#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
|
||||
#else
|
||||
#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
|
||||
#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
|
||||
#endif
|
||||
#else
|
||||
#define LOG_TEE_TIMESTAMP_FMT "%s"
|
||||
#define LOG_TEE_TIMESTAMP_VAL ,""
|
||||
#endif
|
||||
#define LOG_DEFAULT_DEBUG 1
|
||||
#define LOG_DEFAULT_LLAMA 0
|
||||
|
||||
// Allows disabling file/line/function prefix
|
||||
// in order to disable, define LOG_NO_FILE_LINE_FUNCTION
|
||||
// like so:
|
||||
// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
|
||||
// set via gpt_log_set_verbosity()
|
||||
extern int gpt_log_verbosity_thold;
|
||||
|
||||
void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe
|
||||
|
||||
// the gpt_log uses an internal worker thread to print/write log messages
|
||||
// when the worker thread is paused, incoming log messages are discarded
|
||||
struct gpt_log;
|
||||
|
||||
struct gpt_log * gpt_log_init();
|
||||
struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit
|
||||
void gpt_log_pause (struct gpt_log * log); // pause the worker thread, not thread-safe
|
||||
void gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe
|
||||
void gpt_log_free (struct gpt_log * log);
|
||||
|
||||
LOG_ATTRIBUTE_FORMAT(3, 4)
|
||||
void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...);
|
||||
|
||||
// defaults: file = NULL, colors = false, prefix = false, timestamps = false
|
||||
//
|
||||
// #define LOG_NO_FILE_LINE_FUNCTION
|
||||
// #include "log.h"
|
||||
// regular log output:
|
||||
//
|
||||
#ifndef LOG_NO_FILE_LINE_FUNCTION
|
||||
#ifndef _MSC_VER
|
||||
#define LOG_FLF_FMT "[%24s:%5d][%24s] "
|
||||
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
||||
#else
|
||||
#define LOG_FLF_FMT "[%24s:%5ld][%24s] "
|
||||
#define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
|
||||
#endif
|
||||
#else
|
||||
#define LOG_FLF_FMT "%s"
|
||||
#define LOG_FLF_VAL ,""
|
||||
#endif
|
||||
|
||||
#ifdef LOG_TEE_FILE_LINE_FUNCTION
|
||||
#ifndef _MSC_VER
|
||||
#define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
|
||||
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
||||
#else
|
||||
#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
|
||||
#define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
|
||||
#endif
|
||||
#else
|
||||
#define LOG_TEE_FLF_FMT "%s"
|
||||
#define LOG_TEE_FLF_VAL ,""
|
||||
#endif
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
// USE LOG() INSTEAD
|
||||
// ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34)
|
||||
// llm_load_tensors: ggml ctx size = 0.27 MiB
|
||||
// llm_load_tensors: offloading 32 repeating layers to GPU
|
||||
// llm_load_tensors: offloading non-repeating layers to GPU
|
||||
//
|
||||
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
|
||||
#define LOG_IMPL(str, ...) \
|
||||
// with prefix = true, timestamps = true, the log output will look like this:
|
||||
//
|
||||
// 0.00.035.060 D ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34)
|
||||
// 0.00.035.064 I llm_load_tensors: ggml ctx size = 0.27 MiB
|
||||
// 0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
|
||||
// 0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
|
||||
//
|
||||
// I - info (stdout, V = 0)
|
||||
// W - warning (stderr, V = 0)
|
||||
// E - error (stderr, V = 0)
|
||||
// D - debug (stderr, V = LOG_DEFAULT_DEBUG)
|
||||
//
|
||||
|
||||
void gpt_log_set_file (struct gpt_log * log, const char * file); // not thread-safe
|
||||
void gpt_log_set_colors (struct gpt_log * log, bool colors); // not thread-safe
|
||||
void gpt_log_set_prefix (struct gpt_log * log, bool prefix); // whether to output prefix to each log
|
||||
void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // whether to output timestamps in the prefix
|
||||
|
||||
// helper macros for logging
|
||||
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
|
||||
//
|
||||
// for example:
|
||||
//
|
||||
// LOG_DBG("this is a debug message: %d\n", expensive_function());
|
||||
//
|
||||
// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold
|
||||
//
|
||||
|
||||
#define LOG_TMPL(level, verbosity, ...) \
|
||||
do { \
|
||||
if (LOG_TARGET != nullptr) \
|
||||
{ \
|
||||
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
|
||||
fflush(LOG_TARGET); \
|
||||
if ((verbosity) <= gpt_log_verbosity_thold) { \
|
||||
gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \
|
||||
} \
|
||||
} while (0)
|
||||
#else
|
||||
#define LOG_IMPL(str, ...) \
|
||||
do { \
|
||||
if (LOG_TARGET != nullptr) \
|
||||
{ \
|
||||
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
|
||||
fflush(LOG_TARGET); \
|
||||
} \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
// USE LOG_TEE() INSTEAD
|
||||
//
|
||||
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
|
||||
#define LOG_TEE_IMPL(str, ...) \
|
||||
do { \
|
||||
if (LOG_TARGET != nullptr) \
|
||||
{ \
|
||||
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
|
||||
fflush(LOG_TARGET); \
|
||||
} \
|
||||
if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
|
||||
{ \
|
||||
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
|
||||
fflush(LOG_TEE_TARGET); \
|
||||
} \
|
||||
} while (0)
|
||||
#else
|
||||
#define LOG_TEE_IMPL(str, ...) \
|
||||
do { \
|
||||
if (LOG_TARGET != nullptr) \
|
||||
{ \
|
||||
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
|
||||
fflush(LOG_TARGET); \
|
||||
} \
|
||||
if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
|
||||
{ \
|
||||
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
|
||||
fflush(LOG_TEE_TARGET); \
|
||||
} \
|
||||
} while (0)
|
||||
#endif
|
||||
#define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, 0, __VA_ARGS__)
|
||||
#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)
|
||||
|
||||
// The '\0' as a last argument, is a trick to bypass the silly
|
||||
// "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro"
|
||||
// so we can have a single macro which can be called just like printf.
|
||||
#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, 0, __VA_ARGS__)
|
||||
#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, 0, __VA_ARGS__)
|
||||
#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0, __VA_ARGS__)
|
||||
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
|
||||
|
||||
// Main LOG macro.
|
||||
// behaves like printf, and supports arguments the exact same way.
|
||||
//
|
||||
#if !defined(_MSC_VER) || defined(__clang__)
|
||||
#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
|
||||
#else
|
||||
#define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
|
||||
#endif
|
||||
|
||||
// Main TEE macro.
|
||||
// does the same as LOG
|
||||
// and
|
||||
// simultaneously writes stderr.
|
||||
//
|
||||
// Secondary target can be changed just like LOG_TARGET
|
||||
// by defining LOG_TEE_TARGET
|
||||
//
|
||||
#if !defined(_MSC_VER) || defined(__clang__)
|
||||
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
|
||||
#else
|
||||
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
|
||||
#endif
|
||||
|
||||
// LOG macro variants with auto endline.
|
||||
#if !defined(_MSC_VER) || defined(__clang__)
|
||||
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
|
||||
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
|
||||
#else
|
||||
#define LOGLN(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
|
||||
#define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
|
||||
#endif
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
|
||||
{
|
||||
static bool _initialized = false;
|
||||
static bool _append = false;
|
||||
static bool _disabled = filename.empty() && target == nullptr;
|
||||
static std::string log_current_filename{filename};
|
||||
static FILE *log_current_target{target};
|
||||
static FILE *logfile = nullptr;
|
||||
|
||||
if (change)
|
||||
{
|
||||
if (append != LogTriStateSame)
|
||||
{
|
||||
_append = append == LogTriStateTrue;
|
||||
return logfile;
|
||||
}
|
||||
|
||||
if (disable == LogTriStateTrue)
|
||||
{
|
||||
// Disable primary target
|
||||
_disabled = true;
|
||||
}
|
||||
// If previously disabled, only enable, and keep previous target
|
||||
else if (disable == LogTriStateFalse)
|
||||
{
|
||||
_disabled = false;
|
||||
}
|
||||
// Otherwise, process the arguments
|
||||
else if (log_current_filename != filename || log_current_target != target)
|
||||
{
|
||||
_initialized = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (_disabled)
|
||||
{
|
||||
// Log is disabled
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (_initialized)
|
||||
{
|
||||
// with fallback in case something went wrong
|
||||
return logfile ? logfile : stderr;
|
||||
}
|
||||
|
||||
// do the (re)initialization
|
||||
if (target != nullptr)
|
||||
{
|
||||
if (logfile != nullptr && logfile != stdout && logfile != stderr)
|
||||
{
|
||||
fclose(logfile);
|
||||
}
|
||||
|
||||
log_current_filename = LOG_DEFAULT_FILE_NAME;
|
||||
log_current_target = target;
|
||||
|
||||
logfile = target;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (log_current_filename != filename)
|
||||
{
|
||||
if (logfile != nullptr && logfile != stdout && logfile != stderr)
|
||||
{
|
||||
fclose(logfile);
|
||||
}
|
||||
}
|
||||
|
||||
logfile = fopen(filename.c_str(), _append ? "a" : "w");
|
||||
}
|
||||
|
||||
if (!logfile)
|
||||
{
|
||||
// Verify whether the file was opened, otherwise fallback to stderr
|
||||
logfile = stderr;
|
||||
|
||||
fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno));
|
||||
fflush(stderr);
|
||||
|
||||
// At this point we let the init flag be to true below, and let the target fallback to stderr
|
||||
// otherwise we would repeatedly fopen() which was already unsuccessful
|
||||
}
|
||||
|
||||
_initialized = true;
|
||||
|
||||
return logfile ? logfile : stderr;
|
||||
}
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
|
||||
{
|
||||
return log_handler1_impl(change, append, disable, filename, target);
|
||||
}
|
||||
|
||||
// Disables logs entirely at runtime.
|
||||
// Makes LOG() and LOG_TEE() produce no output,
|
||||
// until enabled back.
|
||||
#define log_disable() log_disable_impl()
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
inline FILE *log_disable_impl()
|
||||
{
|
||||
return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue);
|
||||
}
|
||||
|
||||
// Enables logs at runtime.
|
||||
#define log_enable() log_enable_impl()
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
inline FILE *log_enable_impl()
|
||||
{
|
||||
return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse);
|
||||
}
|
||||
|
||||
// Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
|
||||
#define log_set_target(target) log_set_target_impl(target)
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); }
|
||||
inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); }
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
inline FILE *log_handler() { return log_handler1_impl(); }
|
||||
|
||||
// Enable or disable creating separate log files for each run.
|
||||
// can ONLY be invoked BEFORE first log use.
|
||||
#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "")
|
||||
// Enable or disable append mode for log file.
|
||||
// can ONLY be invoked BEFORE first log use.
|
||||
#define log_append(enable) log_append_impl(enable)
|
||||
// INTERNAL, DO NOT USE
|
||||
inline FILE *log_append_impl(bool enable)
|
||||
{
|
||||
return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame);
|
||||
}
|
||||
|
||||
inline void log_test()
|
||||
{
|
||||
log_disable();
|
||||
LOG("01 Hello World to nobody, because logs are disabled!\n");
|
||||
log_enable();
|
||||
LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET));
|
||||
LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n");
|
||||
log_set_target(stderr);
|
||||
LOG("04 Hello World to stderr!\n");
|
||||
LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n");
|
||||
log_set_target(LOG_DEFAULT_FILE_NAME);
|
||||
LOG("06 Hello World to default log file!\n");
|
||||
log_set_target(stdout);
|
||||
LOG("07 Hello World to stdout!\n");
|
||||
log_set_target(LOG_DEFAULT_FILE_NAME);
|
||||
LOG("08 Hello World to default log file again!\n");
|
||||
log_disable();
|
||||
LOG("09 Hello World _1_ into the void!\n");
|
||||
log_enable();
|
||||
LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n");
|
||||
log_disable();
|
||||
log_set_target("llama.anotherlog.log");
|
||||
LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n");
|
||||
log_enable();
|
||||
LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n");
|
||||
log_set_target("llama.yetanotherlog.log");
|
||||
LOG("13 Hello World this time in yet new file?\n");
|
||||
log_set_target(log_filename_generator("llama_autonamed", "log"));
|
||||
LOG("14 Hello World in log with generated filename!\n");
|
||||
#ifdef _MSC_VER
|
||||
LOG_TEE("15 Hello msvc TEE without arguments\n");
|
||||
LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test");
|
||||
LOG_TEELN("17 Hello msvc TEELN without arguments\n");
|
||||
LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test");
|
||||
LOG("19 Hello msvc LOG without arguments\n");
|
||||
LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test");
|
||||
LOGLN("21 Hello msvc LOGLN without arguments\n");
|
||||
LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test");
|
||||
#endif
|
||||
}
|
||||
|
||||
inline bool log_param_single_parse(const std::string & param)
|
||||
{
|
||||
if ( param == "--log-test")
|
||||
{
|
||||
log_test();
|
||||
return true;
|
||||
}
|
||||
|
||||
if ( param == "--log-disable")
|
||||
{
|
||||
log_disable();
|
||||
return true;
|
||||
}
|
||||
|
||||
if ( param == "--log-enable")
|
||||
{
|
||||
log_enable();
|
||||
return true;
|
||||
}
|
||||
|
||||
if (param == "--log-new")
|
||||
{
|
||||
log_multilog(true);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (param == "--log-append")
|
||||
{
|
||||
log_append(true);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string())
|
||||
{
|
||||
if ( param == "--log-file")
|
||||
{
|
||||
if (!check_but_dont_parse)
|
||||
{
|
||||
log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log"));
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
inline void log_print_usage()
|
||||
{
|
||||
printf("log options:\n");
|
||||
/* format
|
||||
printf(" -h, --help show this help message and exit\n");*/
|
||||
/* spacing
|
||||
printf("__-param----------------Description\n");*/
|
||||
printf(" --log-test Run simple logging test\n");
|
||||
printf(" --log-disable Disable trace logs\n");
|
||||
printf(" --log-enable Enable trace logs\n");
|
||||
printf(" --log-file Specify a log filename (without extension)\n");
|
||||
printf(" --log-new Create a separate new log file on start. "
|
||||
"Each log file will have unique name: \"<name>.<ID>.log\"\n");
|
||||
printf(" --log-append Don't truncate the old log file.\n");
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
inline void log_dump_cmdline_impl(int argc, char **argv)
|
||||
{
|
||||
std::stringstream buf;
|
||||
for (int i = 0; i < argc; ++i)
|
||||
{
|
||||
if (std::string(argv[i]).find(' ') != std::string::npos)
|
||||
{
|
||||
buf << " \"" << argv[i] <<"\"";
|
||||
}
|
||||
else
|
||||
{
|
||||
buf << " " << argv[i];
|
||||
}
|
||||
}
|
||||
LOGLN("Cmd:%s", buf.str().c_str());
|
||||
}
|
||||
|
||||
#define log_tostr(var) log_var_to_string_impl(var).c_str()
|
||||
|
||||
inline std::string log_var_to_string_impl(bool var)
|
||||
{
|
||||
return var ? "true" : "false";
|
||||
}
|
||||
|
||||
inline std::string log_var_to_string_impl(std::string var)
|
||||
{
|
||||
return var;
|
||||
}
|
||||
|
||||
inline std::string log_var_to_string_impl(const std::vector<int> & var)
|
||||
{
|
||||
std::stringstream buf;
|
||||
buf << "[ ";
|
||||
bool first = true;
|
||||
for (auto e : var)
|
||||
{
|
||||
if (first)
|
||||
{
|
||||
first = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
buf << ", ";
|
||||
}
|
||||
buf << std::to_string(e);
|
||||
}
|
||||
buf << " ]";
|
||||
|
||||
return buf.str();
|
||||
}
|
||||
|
||||
template <typename C, typename T>
|
||||
inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
|
||||
{
|
||||
std::stringstream buf;
|
||||
buf << "[ ";
|
||||
|
||||
bool first = true;
|
||||
for (const auto & token : tokens)
|
||||
{
|
||||
if (!first) {
|
||||
buf << ", ";
|
||||
} else {
|
||||
first = false;
|
||||
}
|
||||
|
||||
auto detokenized = llama_token_to_piece(ctx, token);
|
||||
|
||||
detokenized.erase(
|
||||
std::remove_if(
|
||||
detokenized.begin(),
|
||||
detokenized.end(),
|
||||
[](const unsigned char c) { return !std::isprint(c); }),
|
||||
detokenized.end());
|
||||
|
||||
buf
|
||||
<< "'" << detokenized << "'"
|
||||
<< ":" << std::to_string(token);
|
||||
}
|
||||
buf << " ]";
|
||||
|
||||
return buf.str();
|
||||
}
|
||||
|
||||
template <typename C, typename B>
|
||||
inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch)
|
||||
{
|
||||
std::stringstream buf;
|
||||
buf << "[ ";
|
||||
|
||||
bool first = true;
|
||||
for (int i = 0; i < batch.n_tokens; ++i)
|
||||
{
|
||||
if (!first) {
|
||||
buf << ", ";
|
||||
} else {
|
||||
first = false;
|
||||
}
|
||||
|
||||
auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
|
||||
|
||||
detokenized.erase(
|
||||
std::remove_if(
|
||||
detokenized.begin(),
|
||||
detokenized.end(),
|
||||
[](const unsigned char c) { return !std::isprint(c); }),
|
||||
detokenized.end());
|
||||
|
||||
buf
|
||||
<< "\n" << std::to_string(i)
|
||||
<< ":token '" << detokenized << "'"
|
||||
<< ":pos " << std::to_string(batch.pos[i])
|
||||
<< ":n_seq_id " << std::to_string(batch.n_seq_id[i])
|
||||
<< ":seq_id " << std::to_string(batch.seq_id[i][0])
|
||||
<< ":logits " << std::to_string(batch.logits[i]);
|
||||
}
|
||||
buf << " ]";
|
||||
|
||||
return buf.str();
|
||||
}
|
||||
|
||||
#ifdef LOG_DISABLE_LOGS
|
||||
|
||||
#undef LOG
|
||||
#define LOG(...) // dummy stub
|
||||
#undef LOGLN
|
||||
#define LOGLN(...) // dummy stub
|
||||
|
||||
#undef LOG_TEE
|
||||
#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
|
||||
|
||||
#undef LOG_TEELN
|
||||
#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
|
||||
|
||||
#undef LOG_DISABLE
|
||||
#define LOG_DISABLE() // dummy stub
|
||||
|
||||
#undef LOG_ENABLE
|
||||
#define LOG_ENABLE() // dummy stub
|
||||
|
||||
#undef LOG_ENABLE
|
||||
#define LOG_ENABLE() // dummy stub
|
||||
|
||||
#undef LOG_SET_TARGET
|
||||
#define LOG_SET_TARGET(...) // dummy stub
|
||||
|
||||
#undef LOG_DUMP_CMDLINE
|
||||
#define LOG_DUMP_CMDLINE(...) // dummy stub
|
||||
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__)
|
||||
#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__)
|
||||
#define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
|
||||
#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
|
||||
|
@ -2,8 +2,11 @@
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <cinttypes>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <fstream>
|
||||
#include <thread>
|
||||
|
||||
void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
|
||||
std::vector<llama_token> & inp, int nnew, bool print_progress) {
|
||||
|
@ -2,6 +2,9 @@
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <unordered_map>
|
||||
|
||||
// the ring buffer works similarly to std::deque, but with a fixed capacity
|
||||
// TODO: deduplicate with llama-impl.h
|
||||
template<typename T>
|
||||
@ -139,7 +142,7 @@ std::string gpt_sampler_params::print() const {
|
||||
struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
|
||||
llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
|
||||
|
||||
lparams.no_perf = false; // TODO: control via params
|
||||
lparams.no_perf = params.no_perf;
|
||||
|
||||
auto * result = new gpt_sampler {
|
||||
/* .params = */ params,
|
||||
@ -254,10 +257,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
|
||||
// TODO: measure grammar performance
|
||||
|
||||
if (gsmpl) {
|
||||
llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
|
||||
llama_perf_sampler_print(gsmpl->chain);
|
||||
}
|
||||
if (ctx) {
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
@ -307,6 +310,10 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
|
||||
return cur_p.data[cur_p.selected].id;
|
||||
}
|
||||
|
||||
uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
|
||||
return llama_sampler_get_seed(gsmpl->chain);
|
||||
}
|
||||
|
||||
// helpers
|
||||
|
||||
llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
|
||||
@ -318,7 +325,7 @@ llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
|
||||
}
|
||||
|
||||
std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
|
||||
std::string result = "\tlogits ";
|
||||
std::string result = "logits ";
|
||||
|
||||
for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
|
||||
const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
|
||||
@ -420,7 +427,7 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
|
||||
}
|
||||
|
||||
std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
|
||||
std::unordered_map<char, gpt_sampler_type> sampler_name_map {
|
||||
std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
|
||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K },
|
||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z },
|
||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P },
|
||||
|
@ -2,61 +2,11 @@
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
enum gpt_sampler_type {
|
||||
GPT_SAMPLER_TYPE_NONE = 0,
|
||||
GPT_SAMPLER_TYPE_TOP_K = 1,
|
||||
GPT_SAMPLER_TYPE_TOP_P = 2,
|
||||
GPT_SAMPLER_TYPE_MIN_P = 3,
|
||||
GPT_SAMPLER_TYPE_TFS_Z = 4,
|
||||
GPT_SAMPLER_TYPE_TYPICAL_P = 5,
|
||||
GPT_SAMPLER_TYPE_TEMPERATURE = 6,
|
||||
};
|
||||
|
||||
// sampling parameters
|
||||
struct gpt_sampler_params {
|
||||
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
||||
|
||||
int32_t n_prev = 64; // number of previous tokens to remember
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
|
||||
int32_t top_k = 40; // <= 0 to use vocab size
|
||||
float top_p = 0.95f; // 1.0 = disabled
|
||||
float min_p = 0.05f; // 0.0 = disabled
|
||||
float tfs_z = 1.00f; // 1.0 = disabled
|
||||
float typ_p = 1.00f; // typical_p, 1.0 = disabled
|
||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||
float dynatemp_range = 0.00f; // 0.0 = disabled
|
||||
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float penalty_repeat = 1.00f; // 1.0 = disabled
|
||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||
float penalty_present = 0.00f; // 0.0 = disabled
|
||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||
float mirostat_tau = 5.00f; // target entropy
|
||||
float mirostat_eta = 0.10f; // learning rate
|
||||
bool penalize_nl = false; // consider newlines as a repeatable token
|
||||
bool ignore_eos = false;
|
||||
|
||||
std::vector<enum gpt_sampler_type> samplers = {
|
||||
GPT_SAMPLER_TYPE_TOP_K,
|
||||
GPT_SAMPLER_TYPE_TFS_Z,
|
||||
GPT_SAMPLER_TYPE_TYPICAL_P,
|
||||
GPT_SAMPLER_TYPE_TOP_P,
|
||||
GPT_SAMPLER_TYPE_MIN_P,
|
||||
GPT_SAMPLER_TYPE_TEMPERATURE
|
||||
};
|
||||
|
||||
std::string grammar; // optional BNF-like grammar to constrain sampling
|
||||
|
||||
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
|
||||
|
||||
// print the parameters into a string
|
||||
std::string print() const;
|
||||
};
|
||||
|
||||
// gpt_sampler extends llama_sampler with additional functionality:
|
||||
//
|
||||
// - grammar support
|
||||
@ -110,6 +60,8 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
|
||||
//
|
||||
llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
|
||||
|
||||
uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
|
||||
|
||||
// helpers
|
||||
|
||||
// access the internal list of current candidate tokens
|
||||
|
@ -1,9 +1,11 @@
|
||||
#include "train.h"
|
||||
#include "common.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <random>
|
||||
#include <sstream>
|
||||
#include <functional>
|
||||
#include <cstring>
|
||||
|
||||
struct random_normal_distribution {
|
||||
std::mt19937 gen;
|
||||
|
@ -131,12 +131,14 @@ class Model:
|
||||
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
|
||||
tensor_names_from_parts: set[str] = set()
|
||||
|
||||
if len(self.part_names) > 1:
|
||||
self.tensor_names = set()
|
||||
index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
|
||||
index_name += ".index.json"
|
||||
index_file = self.dir_model / index_name
|
||||
|
||||
if index_file.is_file():
|
||||
self.tensor_names = set()
|
||||
logger.info(f"gguf: loading model weight map from '{index_name}'")
|
||||
with open(self.dir_model / index_name, "r", encoding="utf-8") as f:
|
||||
with open(index_file, "r", encoding="utf-8") as f:
|
||||
index: dict[str, Any] = json.load(f)
|
||||
weight_map = index.get("weight_map")
|
||||
if weight_map is None or not isinstance(weight_map, dict):
|
||||
@ -144,6 +146,7 @@ class Model:
|
||||
self.tensor_names.update(weight_map.keys())
|
||||
else:
|
||||
self.tensor_names = tensor_names_from_parts
|
||||
weight_map = {}
|
||||
|
||||
for part_name in self.part_names:
|
||||
logger.info(f"gguf: loading model part '{part_name}'")
|
||||
@ -170,9 +173,17 @@ class Model:
|
||||
data = LazyTorchTensor.from_eager(data)
|
||||
yield name, data
|
||||
|
||||
# only verify tensor name presence; it doesn't matter if they are not in the right files
|
||||
if len(sym_diff := tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
|
||||
raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}")
|
||||
# verify tensor name presence and identify potentially missing files
|
||||
if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0:
|
||||
missing = sorted(self.tensor_names.difference(tensor_names_from_parts))
|
||||
extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
|
||||
missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
|
||||
if len(extra) == 0 and len(missing_files) > 0:
|
||||
raise ValueError(f"Missing or incomplete model files: {missing_files}")
|
||||
else:
|
||||
raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
|
||||
f"Missing tensors: {missing}\n"
|
||||
f"Extra tensors: {extra}")
|
||||
|
||||
def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
|
||||
if key not in gguf.MODEL_TENSORS[self.model_arch]:
|
||||
@ -305,6 +316,8 @@ class Model:
|
||||
gguf.MODEL_TENSOR.TIME_MIX_FIRST,
|
||||
gguf.MODEL_TENSOR.TIME_MIX_W1,
|
||||
gguf.MODEL_TENSOR.TIME_MIX_W2,
|
||||
gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
|
||||
gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
|
||||
)
|
||||
)
|
||||
or not new_name.endswith(".weight")
|
||||
@ -627,6 +640,9 @@ class Model:
|
||||
if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
|
||||
# ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
|
||||
res = "exaone"
|
||||
if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085":
|
||||
# ref: https://huggingface.co/microsoft/phi-2
|
||||
res = "phi-2"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@ -1485,7 +1501,7 @@ class StableLMModel(Model):
|
||||
raise ValueError(f"Unprocessed norms: {norms}")
|
||||
|
||||
|
||||
@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
|
||||
@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
|
||||
class LlamaModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.LLAMA
|
||||
|
||||
@ -1839,6 +1855,60 @@ class MiniCPMModel(Model):
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@Model.register("MiniCPM3ForCausalLM")
|
||||
class MiniCPM3Model(Model):
|
||||
model_arch = gguf.MODEL_ARCH.MINICPM3
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
hparams = self.hparams
|
||||
|
||||
rope_dims = hparams["qk_rope_head_dim"]
|
||||
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
|
||||
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
self.gguf_writer.add_block_count(self.block_count)
|
||||
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
||||
self.gguf_writer.add_head_count(hparams["num_attention_heads"])
|
||||
self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
|
||||
self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
|
||||
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||
if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
|
||||
self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
|
||||
self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
|
||||
self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
|
||||
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
|
||||
|
||||
rope_scaling = self.find_hparam(['rope_scaling'], True)
|
||||
if rope_scaling is None:
|
||||
return
|
||||
|
||||
long_factors = rope_scaling.get('long_factor', None)
|
||||
short_factors = rope_scaling.get('short_factor', None)
|
||||
|
||||
if long_factors is None or short_factors is None:
|
||||
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
|
||||
|
||||
if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
|
||||
raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
|
||||
|
||||
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
|
||||
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_llama_hf()
|
||||
|
||||
def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
|
||||
if n_kv_head is not None and n_head != n_kv_head:
|
||||
n_head //= n_kv_head
|
||||
|
||||
return (
|
||||
weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
||||
.swapaxes(1, 2)
|
||||
.reshape(weights.shape)
|
||||
)
|
||||
|
||||
|
||||
@Model.register("QWenLMHeadModel")
|
||||
class QwenModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.QWEN
|
||||
@ -2778,6 +2848,8 @@ class Rwkv6Model(Model):
|
||||
self.gguf_writer.add_tokenizer_model("rwkv")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
block_count = self.hparams["num_hidden_layers"]
|
||||
@ -2946,6 +3018,66 @@ class OlmoModel(Model):
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@Model.register("OlmoeForCausalLM")
|
||||
class OlmoeModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.OLMOE
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_layer_norm_rms_eps(1e-5)
|
||||
if (n_experts := self.hparams.get("num_experts")) is not None:
|
||||
self.gguf_writer.add_expert_count(n_experts)
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
# Copied from: Qwen2MoeModel
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# process the experts separately
|
||||
if name.find("experts") != -1:
|
||||
n_experts = self.hparams["num_experts"]
|
||||
assert bid is not None
|
||||
|
||||
if self._experts is None:
|
||||
self._experts = [{} for _ in range(self.block_count)]
|
||||
|
||||
self._experts[bid][name] = data_torch
|
||||
|
||||
if len(self._experts[bid]) >= n_experts * 3:
|
||||
tensors: list[tuple[str, Tensor]] = []
|
||||
|
||||
# merge the experts into a single 3d tensor
|
||||
for w_name in ["down_proj", "gate_proj", "up_proj"]:
|
||||
datas: list[Tensor] = []
|
||||
|
||||
for xid in range(n_experts):
|
||||
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
|
||||
datas.append(self._experts[bid][ename])
|
||||
del self._experts[bid][ename]
|
||||
|
||||
data_torch = torch.stack(datas, dim=0)
|
||||
|
||||
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
|
||||
|
||||
new_name = self.map_tensor_name(merged_name)
|
||||
|
||||
tensors.append((new_name, data_torch))
|
||||
return tensors
|
||||
else:
|
||||
return []
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
# Copied from: Qwen2MoeModel
|
||||
def prepare_tensors(self):
|
||||
super().prepare_tensors()
|
||||
|
||||
if self._experts is not None:
|
||||
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
experts = [k for d in self._experts for k in d.keys()]
|
||||
if len(experts) > 0:
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@Model.register("JinaBertModel", "JinaBertForMaskedLM")
|
||||
class JinaBertV2Model(BertModel):
|
||||
model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
|
||||
|
@ -31,6 +31,7 @@ import re
|
||||
import requests
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
|
||||
from hashlib import sha256
|
||||
from enum import IntEnum, auto
|
||||
@ -97,6 +98,7 @@ models = [
|
||||
{'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
|
||||
{'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
|
||||
{"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
|
||||
{"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
|
||||
]
|
||||
|
||||
|
||||
@ -125,6 +127,21 @@ def download_model(model):
|
||||
if tokt == TOKENIZER_TYPE.UGM:
|
||||
files.append("spiece.model")
|
||||
|
||||
if os.path.isdir(repo):
|
||||
# If repo is a path on the file system, copy the directory
|
||||
for file in files:
|
||||
src_path = os.path.join(repo, file)
|
||||
dst_path = f"models/tokenizers/{name}/{file}"
|
||||
if os.path.isfile(dst_path):
|
||||
logger.info(f"{name}: File {dst_path} already exists - skipping")
|
||||
continue
|
||||
if os.path.isfile(src_path):
|
||||
shutil.copy2(src_path, dst_path)
|
||||
logger.info(f"{name}: Copied {src_path} to {dst_path}")
|
||||
else:
|
||||
logger.warning(f"{name}: Source file {src_path} does not exist")
|
||||
else:
|
||||
# If repo is a URL, download the files
|
||||
for file in files:
|
||||
save_path = f"models/tokenizers/{name}/{file}"
|
||||
if os.path.isfile(save_path):
|
||||
|
@ -367,7 +367,13 @@ if __name__ == '__main__':
|
||||
yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B)))
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
dest = super().modify_tensors(data_torch, name, bid)
|
||||
dest = list(super().modify_tensors(data_torch, name, bid))
|
||||
# some archs may have the same tensor for lm_head and output (tie word embeddings)
|
||||
# in this case, adapters targeting lm_head will fail when using llama-export-lora
|
||||
# therefore, we ignore them for now
|
||||
# see: https://github.com/ggerganov/llama.cpp/issues/9065
|
||||
if name == "lm_head.weight" and len(dest) == 0:
|
||||
raise ValueError("lm_head is present in adapter, but is ignored in base model")
|
||||
for dest_name, dest_data in dest:
|
||||
assert isinstance(dest_data, LoraTorchTensor)
|
||||
lora_a, lora_b = dest_data.get_lora_A_B()
|
||||
|
@ -380,3 +380,9 @@ For detailed info, such as model/device supports, CANN install, please refer to
|
||||
### Android
|
||||
|
||||
To read documentation for how to build on Android, [click here](./android.md)
|
||||
|
||||
### Arm CPU optimized mulmat kernels
|
||||
|
||||
Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
|
||||
|
||||
To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
|
||||
|
@ -1,47 +1,28 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// mutates the input string
|
||||
static std::vector<int> parse_list(char * p) {
|
||||
std::vector<int> ret;
|
||||
|
||||
char * q = p;
|
||||
|
||||
while (*p) {
|
||||
if (*p == ',') {
|
||||
*p = '\0';
|
||||
ret.push_back(std::atoi(q));
|
||||
q = p + 1;
|
||||
}
|
||||
|
||||
++p;
|
||||
}
|
||||
|
||||
ret.push_back(std::atoi(q));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void print_usage(int, char ** argv) {
|
||||
LOG_TEE("\nexample usage:\n");
|
||||
LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
|
||||
LOG_TEE("\n");
|
||||
LOG("\nexample usage:\n");
|
||||
LOG("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
|
||||
LOG("\n");
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_BENCH, print_usage);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
gpt_init();
|
||||
|
||||
int is_pp_shared = params.is_pp_shared;
|
||||
|
||||
std::vector<int> n_pp = params.n_pp;
|
||||
@ -98,7 +79,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const int ret = llama_decode(ctx, batch_view);
|
||||
if (ret != 0) {
|
||||
LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
|
||||
LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -115,17 +96,17 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (!params.batched_bench_output_jsonl) {
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
||||
LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
|
||||
LOG("\n");
|
||||
LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
|
||||
LOG("\n");
|
||||
LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
||||
LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
|
||||
}
|
||||
|
||||
for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
|
||||
@ -155,7 +136,7 @@ int main(int argc, char ** argv) {
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -177,7 +158,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
@ -195,21 +176,21 @@ int main(int argc, char ** argv) {
|
||||
const float speed = n_kv / t;
|
||||
|
||||
if(params.batched_bench_output_jsonl) {
|
||||
LOG_TEE(
|
||||
LOG(
|
||||
"{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
|
||||
"\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
|
||||
n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
|
||||
pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
|
||||
);
|
||||
} else {
|
||||
LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
|
||||
LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
LOG("\n");
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
@ -218,7 +199,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
LOG("\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -140,8 +140,6 @@ while n_cur <= n_len {
|
||||
|
||||
let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])
|
||||
|
||||
llama_sampler_accept(smpl, new_token_id)
|
||||
|
||||
// is it an end of stream? -> mark the stream as finished
|
||||
if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
|
||||
i_batch[i] = -1
|
||||
@ -202,8 +200,8 @@ let t_main_end = ggml_time_us()
|
||||
|
||||
print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n")
|
||||
|
||||
llama_perf_print(UnsafeRawPointer(context), LLAMA_PERF_TYPE_CONTEXT)
|
||||
llama_perf_print(UnsafeRawPointer(smpl), LLAMA_PERF_TYPE_SAMPLER_CHAIN)
|
||||
llama_perf_sampler_print(smpl)
|
||||
llama_perf_context_print(context)
|
||||
|
||||
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
|
||||
let utf8Count = text.utf8.count
|
||||
|
@ -1,4 +1,6 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
@ -7,9 +9,9 @@
|
||||
#include <vector>
|
||||
|
||||
static void print_usage(int, char ** argv) {
|
||||
LOG_TEE("\nexample usage:\n");
|
||||
LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
|
||||
LOG_TEE("\n");
|
||||
LOG("\nexample usage:\n");
|
||||
LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
|
||||
LOG("\n");
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
@ -18,11 +20,11 @@ int main(int argc, char ** argv) {
|
||||
params.prompt = "Hello my name is";
|
||||
params.n_predict = 32;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
gpt_init();
|
||||
|
||||
// number of parallel batches
|
||||
int n_parallel = params.n_parallel;
|
||||
@ -42,7 +44,7 @@ int main(int argc, char ** argv) {
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
||||
LOG_ERR("%s: error: unable to load model\n" , __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -72,31 +74,29 @@ int main(int argc, char ** argv) {
|
||||
llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
|
||||
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
|
||||
LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
|
||||
LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
|
||||
|
||||
// make sure the KV cache is big enough to hold all the prompt and generated tokens
|
||||
if (n_kv_req > n_ctx) {
|
||||
LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
|
||||
LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
|
||||
LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
|
||||
LOG_ERR("%s: either reduce n_parallel or increase n_ctx\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// print the prompt token-by-token
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
LOG("\n");
|
||||
|
||||
for (auto id : tokens_list) {
|
||||
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
|
||||
LOG("%s", llama_token_to_piece(ctx, id).c_str());
|
||||
}
|
||||
|
||||
fflush(stderr);
|
||||
|
||||
// create a llama_batch
|
||||
// we use this object to submit token data for decoding
|
||||
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
|
||||
@ -114,7 +114,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
if (llama_model_has_encoder(model)) {
|
||||
if (llama_encode(ctx, batch)) {
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -131,7 +131,7 @@ int main(int argc, char ** argv) {
|
||||
batch.logits[batch.n_tokens - 1] = true;
|
||||
|
||||
if (llama_decode(ctx, batch) != 0) {
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -142,7 +142,7 @@ int main(int argc, char ** argv) {
|
||||
//}
|
||||
|
||||
if (n_parallel > 1) {
|
||||
LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
|
||||
LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
|
||||
}
|
||||
|
||||
// main loop
|
||||
@ -172,14 +172,12 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
|
||||
|
||||
llama_sampler_accept(smpl, new_token_id);
|
||||
|
||||
// is it an end of generation? -> mark the stream as finished
|
||||
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
|
||||
i_batch[i] = -1;
|
||||
LOG_TEE("\n");
|
||||
LOG("\n");
|
||||
if (n_parallel > 1) {
|
||||
LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
|
||||
LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
|
||||
}
|
||||
|
||||
continue;
|
||||
@ -187,8 +185,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// if there is only one stream, we print immediately to stdout
|
||||
if (n_parallel == 1) {
|
||||
LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
|
||||
fflush(stdout);
|
||||
LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
|
||||
}
|
||||
|
||||
streams[i] += llama_token_to_piece(ctx, new_token_id);
|
||||
@ -210,29 +207,27 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// evaluate the current batch with the transformer model
|
||||
if (llama_decode(ctx, batch)) {
|
||||
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
|
||||
LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
|
||||
if (n_parallel > 1) {
|
||||
LOG_TEE("\n");
|
||||
LOG("\n");
|
||||
|
||||
for (int32_t i = 0; i < n_parallel; ++i) {
|
||||
LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
|
||||
LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
|
||||
}
|
||||
}
|
||||
|
||||
const auto t_main_end = ggml_time_us();
|
||||
|
||||
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
|
||||
LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
|
||||
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
LOG("\n");
|
||||
llama_perf_sampler_print(smpl);
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
|
@ -183,7 +183,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
|
||||
|
||||
TENSOR_DUMP(gf->nodes[0]);
|
||||
TENSOR_DUMP(ggml_graph_node(gf, 0));
|
||||
|
||||
printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
|
||||
|
||||
@ -224,7 +224,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
|
||||
// Let's use the F32 result from above as a reference for the quantized multiplication
|
||||
float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
|
||||
float sum_of_F32_reference = tensor_sum_elements(ggml_graph_node(gf, 0));
|
||||
|
||||
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
|
||||
printf("=====================================================================================\n");
|
||||
@ -252,7 +252,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// Check that the matrix multiplication result is in the right ballpark
|
||||
// We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
|
||||
float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
|
||||
float sum_of_Q4_result = tensor_sum_elements(ggml_graph_node(gf31, 0));
|
||||
float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
|
||||
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
|
||||
|
||||
|
@ -9,6 +9,7 @@
|
||||
#include <climits>
|
||||
#include <cstring>
|
||||
#include <cstdarg>
|
||||
#include <cinttypes>
|
||||
#include <ctime>
|
||||
#include <random>
|
||||
#include <stdexcept>
|
||||
@ -105,43 +106,43 @@ static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_
|
||||
const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
|
||||
try {
|
||||
w->token_embedding_table.resize(p->vocab_size * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
||||
LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
||||
|
||||
w->rms_att_weight.resize(p->n_layers * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
|
||||
LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
|
||||
|
||||
w->rms_ffn_weight.resize(p->n_layers * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
|
||||
LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
|
||||
|
||||
w->wq.resize(p->n_layers * p->dim * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
||||
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
||||
|
||||
w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
|
||||
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
|
||||
|
||||
w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
|
||||
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
|
||||
|
||||
w->wo.resize(p->n_layers * p->dim * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
||||
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
||||
|
||||
w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
|
||||
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
|
||||
|
||||
w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
|
||||
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
|
||||
|
||||
w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
|
||||
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
|
||||
|
||||
w->rms_final_weight.resize(p->dim);
|
||||
LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
|
||||
LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
|
||||
|
||||
if (shared_weights) {
|
||||
w->wcls = {};
|
||||
} else {
|
||||
w->wcls.resize(p->vocab_size * p->dim);
|
||||
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
||||
LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
||||
}
|
||||
}
|
||||
catch (std::length_error &) {
|
||||
@ -173,7 +174,7 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
|
||||
fseek(f, 0, SEEK_END);
|
||||
auto end = ftell(f);
|
||||
if (curr != end) {
|
||||
LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
|
||||
LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -181,20 +182,20 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
|
||||
}
|
||||
|
||||
static void print_sample_weights(TransformerWeights *w){
|
||||
LOG("----- Quick print of first of the weight vales of all the variables\n");
|
||||
LOG("%f\n", w->token_embedding_table[0]);
|
||||
LOG("%f\n", w->rms_att_weight[0]);
|
||||
LOG("%f\n", w->rms_ffn_weight[0]);
|
||||
LOG_INF("----- Quick print of first of the weight vales of all the variables\n");
|
||||
LOG_INF("%f\n", w->token_embedding_table[0]);
|
||||
LOG_INF("%f\n", w->rms_att_weight[0]);
|
||||
LOG_INF("%f\n", w->rms_ffn_weight[0]);
|
||||
|
||||
LOG("%f\n", w->wq[0]);
|
||||
LOG("%f\n", w->wk[0]);
|
||||
LOG("%f\n", w->wv[0]);
|
||||
LOG("%f\n", w->wo[0]);
|
||||
LOG("%f\n", w->w1[0]);
|
||||
LOG("%f\n", w->w2[0]);
|
||||
LOG("%f\n", w->w3[0]);
|
||||
LOG("%f\n", w->rms_att_weight[0]);
|
||||
if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]);
|
||||
LOG_INF("%f\n", w->wq[0]);
|
||||
LOG_INF("%f\n", w->wk[0]);
|
||||
LOG_INF("%f\n", w->wv[0]);
|
||||
LOG_INF("%f\n", w->wo[0]);
|
||||
LOG_INF("%f\n", w->w1[0]);
|
||||
LOG_INF("%f\n", w->w2[0]);
|
||||
LOG_INF("%f\n", w->w3[0]);
|
||||
LOG_INF("%f\n", w->rms_att_weight[0]);
|
||||
if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]);
|
||||
}
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@ -318,20 +319,20 @@ struct train_params {
|
||||
};
|
||||
|
||||
static void print_params(struct my_llama_hparams * params) {
|
||||
LOG("%s: n_vocab: %u\n", __func__, params->n_vocab);
|
||||
LOG("%s: n_ctx: %u\n", __func__, params->n_ctx);
|
||||
LOG("%s: n_embd: %u\n", __func__, params->n_embd);
|
||||
LOG("%s: n_mult: %u\n", __func__, params->n_mult);
|
||||
LOG("%s: n_head: %u\n", __func__, params->n_head);
|
||||
LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
|
||||
LOG("%s: n_ff: %u\n", __func__, params->n_ff);
|
||||
LOG("%s: n_layer: %u\n", __func__, params->n_layer);
|
||||
LOG("%s: n_rot: %u\n", __func__, params->n_rot);
|
||||
LOG_INF("%s: n_vocab: %u\n", __func__, params->n_vocab);
|
||||
LOG_INF("%s: n_ctx: %u\n", __func__, params->n_ctx);
|
||||
LOG_INF("%s: n_embd: %u\n", __func__, params->n_embd);
|
||||
LOG_INF("%s: n_mult: %u\n", __func__, params->n_mult);
|
||||
LOG_INF("%s: n_head: %u\n", __func__, params->n_head);
|
||||
LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
|
||||
LOG_INF("%s: n_ff: %u\n", __func__, params->n_ff);
|
||||
LOG_INF("%s: n_layer: %u\n", __func__, params->n_layer);
|
||||
LOG_INF("%s: n_rot: %u\n", __func__, params->n_rot);
|
||||
}
|
||||
|
||||
static void print_tensor_info(const struct ggml_context * ctx) {
|
||||
for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
LOG("%s: Allocating ", __func__);
|
||||
LOG_INF("%s: Allocating ", __func__);
|
||||
int64_t total = 1;
|
||||
int i = 0;
|
||||
for (; i < ggml_n_dims(t); ++i) {
|
||||
@ -526,7 +527,7 @@ static std::string llama_escape_whitespaces(const std::string & text) {
|
||||
|
||||
static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
|
||||
if (is_ggml_file(filename)) {
|
||||
LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
|
||||
LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
|
||||
struct ggml_context * ctx_data = NULL;
|
||||
|
||||
struct gguf_init_params params = {
|
||||
@ -574,7 +575,7 @@ static void load_vocab(const char * filename, const Config * config, struct llam
|
||||
gguf_free(ctx);
|
||||
} else {
|
||||
// assume llama2.c vocabulary
|
||||
LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
|
||||
LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
|
||||
llama_file file(filename, "rb");
|
||||
if (!file.fp) {
|
||||
die_fmt("%s: %s", strerror(errno), filename);
|
||||
@ -871,23 +872,25 @@ static std::string basename(const std::string &path) {
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_init();
|
||||
|
||||
struct train_params params = get_default_train_params();
|
||||
if (!params_parse(argc, argv, ¶ms)) {
|
||||
return 1;
|
||||
}
|
||||
log_set_target(stdout);
|
||||
|
||||
Config config;
|
||||
TransformerWeights weights = {};
|
||||
{
|
||||
LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
|
||||
LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
|
||||
FILE * file = fopen(params.fn_llama2c_model, "rb");
|
||||
if (!file) {
|
||||
LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
|
||||
LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
|
||||
return 1;
|
||||
}
|
||||
// read in the config header
|
||||
if (fread(&config, sizeof(Config), 1, file) != 1) {
|
||||
LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
|
||||
LOG_ERR("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
|
||||
return 1;
|
||||
}
|
||||
auto shared_weights = config.vocab_size > 0;
|
||||
@ -896,7 +899,7 @@ int main(int argc, char ** argv) {
|
||||
// read in the Transformer weights
|
||||
alloc_weights(&weights, &config, shared_weights);
|
||||
if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
|
||||
LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
|
||||
LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
|
||||
return 1;
|
||||
}
|
||||
fclose(file);
|
||||
@ -929,7 +932,7 @@ int main(int argc, char ** argv) {
|
||||
model.name = basename(params.fn_llama2c_model);
|
||||
save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
|
||||
|
||||
LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
|
||||
LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
|
||||
|
||||
ggml_free(model.ctx);
|
||||
return 0;
|
||||
|
@ -1,3 +1,4 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
@ -12,14 +13,15 @@
|
||||
#include "ggml-metal.h"
|
||||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
#include <climits>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <climits>
|
||||
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
@ -388,8 +390,7 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -12,12 +12,9 @@
|
||||
|
||||
#include <cstdio>
|
||||
#include <ctime>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
|
||||
#define DEBUG_POS 5
|
||||
|
||||
@ -229,8 +226,8 @@ static ggml_status compute_piter(
|
||||
result.eigenvectors.resize(params.n_batch);
|
||||
result.distances.resize(params.n_batch);
|
||||
// get output nodes
|
||||
for (int i = 0; i < gf->n_nodes; ++i) {
|
||||
auto node = gf->nodes[i];
|
||||
for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
|
||||
auto node = ggml_graph_node(gf, i);
|
||||
int iter = -1;
|
||||
// find b_tensor (without copying data from device)
|
||||
if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
|
||||
|
@ -1,4 +1,6 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <ctime>
|
||||
@ -38,16 +40,16 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
// run model
|
||||
fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
||||
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
||||
if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
|
||||
// encoder-only model
|
||||
if (llama_encode(ctx, batch) < 0) {
|
||||
fprintf(stderr, "%s : failed to encode\n", __func__);
|
||||
LOG_ERR("%s : failed to encode\n", __func__);
|
||||
}
|
||||
} else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
|
||||
// decoder-only model
|
||||
if (llama_decode(ctx, batch) < 0) {
|
||||
fprintf(stderr, "%s : failed to decode\n", __func__);
|
||||
LOG_ERR("%s : failed to decode\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
@ -79,19 +81,16 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EMBEDDING);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
gpt_init();
|
||||
|
||||
params.embedding = true;
|
||||
// For non-causal models, batch size must be equal to ubatch size
|
||||
params.n_ubatch = params.n_batch;
|
||||
|
||||
print_build_info();
|
||||
|
||||
LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
@ -101,7 +100,7 @@ int main(int argc, char ** argv) {
|
||||
llama_model * model = llama_init.model;
|
||||
llama_context * ctx = llama_init.context;
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
||||
LOG_ERR("%s: unable to load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -111,19 +110,19 @@ int main(int argc, char ** argv) {
|
||||
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
|
||||
|
||||
if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
|
||||
fprintf(stderr, "%s: error: computing embeddings in encoder-decoder models is not supported\n", __func__);
|
||||
LOG_ERR("%s: computing embeddings in encoder-decoder models is not supported\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (n_ctx > n_ctx_train) {
|
||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||
LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||
__func__, n_ctx_train, n_ctx);
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
||||
LOG_INF("\n");
|
||||
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
|
||||
}
|
||||
|
||||
// split the prompt into lines
|
||||
@ -138,7 +137,7 @@ int main(int argc, char ** argv) {
|
||||
for (const auto & prompt : prompts) {
|
||||
auto inp = ::llama_tokenize(ctx, prompt, true, false);
|
||||
if (inp.size() > n_batch) {
|
||||
fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
|
||||
LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
|
||||
__func__, (long long int) inp.size(), (long long int) n_batch);
|
||||
return 1;
|
||||
}
|
||||
@ -149,20 +148,20 @@ int main(int argc, char ** argv) {
|
||||
// it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
|
||||
for (auto & inp : inputs) {
|
||||
if (inp.empty() || inp.back() != llama_token_sep(model)) {
|
||||
fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
|
||||
fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
|
||||
LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
|
||||
LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
// tokenization stats
|
||||
if (params.verbose_prompt) {
|
||||
for (int i = 0; i < (int) inputs.size(); i++) {
|
||||
fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
|
||||
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
|
||||
LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
|
||||
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
|
||||
for (int j = 0; j < (int) inputs[i].size(); j++) {
|
||||
fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
|
||||
LOG("%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
|
||||
}
|
||||
fprintf(stderr, "\n\n");
|
||||
LOG("\n\n");
|
||||
}
|
||||
}
|
||||
|
||||
@ -213,57 +212,57 @@ int main(int argc, char ** argv) {
|
||||
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
|
||||
|
||||
if (params.embd_out.empty()) {
|
||||
fprintf(stdout, "\n");
|
||||
LOG("\n");
|
||||
|
||||
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
||||
for (int j = 0; j < n_embd_count; j++) {
|
||||
fprintf(stdout, "embedding %d: ", j);
|
||||
LOG("embedding %d: ", j);
|
||||
for (int i = 0; i < std::min(3, n_embd); i++) {
|
||||
if (params.embd_normalize == 0) {
|
||||
fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
|
||||
LOG("%6.0f ", emb[j * n_embd + i]);
|
||||
} else {
|
||||
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
|
||||
LOG("%9.6f ", emb[j * n_embd + i]);
|
||||
}
|
||||
}
|
||||
fprintf(stdout, " ... ");
|
||||
LOG(" ... ");
|
||||
for (int i = n_embd - 3; i < n_embd; i++) {
|
||||
if (params.embd_normalize == 0) {
|
||||
fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
|
||||
LOG("%6.0f ", emb[j * n_embd + i]);
|
||||
} else {
|
||||
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
|
||||
LOG("%9.6f ", emb[j * n_embd + i]);
|
||||
}
|
||||
}
|
||||
fprintf(stdout, "\n");
|
||||
LOG("\n");
|
||||
}
|
||||
} else {
|
||||
// print the first part of the embeddings or for a single prompt, the full embedding
|
||||
for (int j = 0; j < n_prompts; j++) {
|
||||
fprintf(stdout, "embedding %d: ", j);
|
||||
LOG("embedding %d: ", j);
|
||||
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
|
||||
if (params.embd_normalize == 0) {
|
||||
fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
|
||||
LOG("%6.0f ", emb[j * n_embd + i]);
|
||||
} else {
|
||||
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
|
||||
LOG("%9.6f ", emb[j * n_embd + i]);
|
||||
}
|
||||
}
|
||||
fprintf(stdout, "\n");
|
||||
LOG("\n");
|
||||
}
|
||||
|
||||
// print cosine similarity matrix
|
||||
if (n_prompts > 1) {
|
||||
fprintf(stdout, "\n");
|
||||
printf("cosine similarity matrix:\n\n");
|
||||
LOG("\n");
|
||||
LOG("cosine similarity matrix:\n\n");
|
||||
for (int i = 0; i < n_prompts; i++) {
|
||||
fprintf(stdout, "%6.6s ", prompts[i].c_str());
|
||||
LOG("%6.6s ", prompts[i].c_str());
|
||||
}
|
||||
fprintf(stdout, "\n");
|
||||
LOG("\n");
|
||||
for (int i = 0; i < n_prompts; i++) {
|
||||
for (int j = 0; j < n_prompts; j++) {
|
||||
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
|
||||
fprintf(stdout, "%6.2f ", sim);
|
||||
LOG("%6.2f ", sim);
|
||||
}
|
||||
fprintf(stdout, "%1.10s", prompts[i].c_str());
|
||||
fprintf(stdout, "\n");
|
||||
LOG("%1.10s", prompts[i].c_str());
|
||||
LOG("\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -272,43 +271,43 @@ int main(int argc, char ** argv) {
|
||||
if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
|
||||
const bool notArray = params.embd_out != "array";
|
||||
|
||||
fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
|
||||
LOG(notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
|
||||
for (int j = 0;;) { // at least one iteration (one prompt)
|
||||
if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
|
||||
fprintf(stdout, "[");
|
||||
if (notArray) LOG(" {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
|
||||
LOG("[");
|
||||
for (int i = 0;;) { // at least one iteration (n_embd > 0)
|
||||
fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
|
||||
LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
|
||||
i++;
|
||||
if (i < n_embd) fprintf(stdout, ","); else break;
|
||||
if (i < n_embd) LOG(","); else break;
|
||||
}
|
||||
fprintf(stdout, notArray ? "]\n }" : "]");
|
||||
LOG(notArray ? "]\n }" : "]");
|
||||
j++;
|
||||
if (j < n_embd_count) fprintf(stdout, notArray ? ",\n" : ","); else break;
|
||||
if (j < n_embd_count) LOG(notArray ? ",\n" : ","); else break;
|
||||
}
|
||||
fprintf(stdout, notArray ? "\n ]" : "]\n");
|
||||
LOG(notArray ? "\n ]" : "]\n");
|
||||
|
||||
if (params.embd_out == "json+" && n_prompts > 1) {
|
||||
fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
|
||||
LOG(",\n \"cosineSimilarity\": [\n");
|
||||
for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
|
||||
fprintf(stdout, " [");
|
||||
LOG(" [");
|
||||
for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
|
||||
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
|
||||
fprintf(stdout, "%6.2f", sim);
|
||||
LOG("%6.2f", sim);
|
||||
j++;
|
||||
if (j < n_embd_count) fprintf(stdout, ", "); else break;
|
||||
if (j < n_embd_count) LOG(", "); else break;
|
||||
}
|
||||
fprintf(stdout, " ]");
|
||||
LOG(" ]");
|
||||
i++;
|
||||
if (i < n_embd_count) fprintf(stdout, ",\n"); else break;
|
||||
if (i < n_embd_count) LOG(",\n"); else break;
|
||||
}
|
||||
fprintf(stdout, "\n ]");
|
||||
LOG("\n ]");
|
||||
}
|
||||
|
||||
if (notArray) fprintf(stdout, "\n}\n");
|
||||
if (notArray) LOG("\n}\n");
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
LOG("\n");
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
// clean up
|
||||
llama_batch_free(batch);
|
||||
|
@ -1,11 +1,11 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
/**
|
||||
@ -31,22 +31,22 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
|
||||
GGML_ASSERT(n > 0);
|
||||
float sum = 0;
|
||||
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
||||
printf(" [\n");
|
||||
LOG(" [\n");
|
||||
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
||||
if (i2 == n && ne[2] > 2*n) {
|
||||
printf(" ..., \n");
|
||||
LOG(" ..., \n");
|
||||
i2 = ne[2] - n;
|
||||
}
|
||||
printf(" [\n");
|
||||
LOG(" [\n");
|
||||
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
|
||||
if (i1 == n && ne[1] > 2*n) {
|
||||
printf(" ..., \n");
|
||||
LOG(" ..., \n");
|
||||
i1 = ne[1] - n;
|
||||
}
|
||||
printf(" [");
|
||||
LOG(" [");
|
||||
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
|
||||
if (i0 == n && ne[0] > 2*n) {
|
||||
printf("..., ");
|
||||
LOG("..., ");
|
||||
i0 = ne[0] - n;
|
||||
}
|
||||
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
|
||||
@ -64,16 +64,16 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
|
||||
} else {
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
printf("%12.4f", v);
|
||||
LOG("%12.4f", v);
|
||||
sum += v;
|
||||
if (i0 < ne[0] - 1) printf(", ");
|
||||
if (i0 < ne[0] - 1) LOG(", ");
|
||||
}
|
||||
printf("],\n");
|
||||
LOG("],\n");
|
||||
}
|
||||
printf(" ],\n");
|
||||
LOG(" ],\n");
|
||||
}
|
||||
printf(" ]\n");
|
||||
printf(" sum = %f\n", sum);
|
||||
LOG(" ]\n");
|
||||
LOG(" sum = %f\n", sum);
|
||||
}
|
||||
}
|
||||
|
||||
@ -102,7 +102,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
|
||||
}
|
||||
|
||||
printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
|
||||
LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
|
||||
t->name, ggml_type_name(t->type), ggml_op_desc(t),
|
||||
src0->name, ggml_ne_string(src0).c_str(),
|
||||
src1 ? src1_str : "",
|
||||
@ -132,7 +132,7 @@ static bool run(llama_context * ctx, const gpt_params & params) {
|
||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -144,12 +144,11 @@ int main(int argc, char ** argv) {
|
||||
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
print_build_info();
|
||||
gpt_init();
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
@ -166,14 +165,15 @@ int main(int argc, char ** argv) {
|
||||
llama_model * model = llama_init.model;
|
||||
llama_context * ctx = llama_init.context;
|
||||
if (model == nullptr || ctx == nullptr) {
|
||||
fprintf(stderr, "%s : failed to init\n", __func__);
|
||||
LOG_ERR("%s : failed to init\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
||||
LOG_INF("\n");
|
||||
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
|
||||
LOG_INF("\n");
|
||||
}
|
||||
|
||||
bool OK = run(ctx, params);
|
||||
@ -181,8 +181,8 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
LOG("\n");
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
@ -1,3 +1,4 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
@ -369,7 +370,7 @@ struct lora_merge_ctx {
|
||||
|
||||
// write data to output file
|
||||
{
|
||||
auto result = gf->nodes[gf->n_nodes - 1];
|
||||
auto * result = ggml_graph_node(gf, -1);
|
||||
size_t len = ggml_nbytes(result);
|
||||
if (read_buf.size() < len) {
|
||||
read_buf.resize(len);
|
||||
@ -401,12 +402,11 @@ static void print_usage(int, char ** argv) {
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
g_verbose = (params.verbosity == 1);
|
||||
g_verbose = (params.verbosity > 1);
|
||||
try {
|
||||
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
|
||||
ctx.run_merge();
|
||||
|
@ -1,3 +1,4 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
|
||||
#include <fstream>
|
||||
@ -9,11 +10,11 @@ static void export_md(std::string fname, llama_example ex) {
|
||||
std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
|
||||
|
||||
gpt_params params;
|
||||
auto options = gpt_params_parser_init(params, ex);
|
||||
auto ctx_arg = gpt_params_parser_init(params, ex);
|
||||
|
||||
file << "| Argument | Explanation |\n";
|
||||
file << "| -------- | ----------- |\n";
|
||||
for (auto & opt : options) {
|
||||
for (auto & opt : ctx_arg.options) {
|
||||
file << "| `";
|
||||
// args
|
||||
for (const auto & arg : opt.args) {
|
||||
|
@ -152,7 +152,7 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
|
||||
throw std::invalid_argument("error: invalid parameter for argument: " + arg);
|
||||
}
|
||||
|
||||
if (argc - arg_idx < 2) {
|
||||
if (argc - arg_idx != 2) {
|
||||
throw std::invalid_argument("error: bad arguments");
|
||||
}
|
||||
|
||||
@ -389,10 +389,17 @@ static void gguf_merge(const split_params & split_params) {
|
||||
int n_split = 1;
|
||||
int total_tensors = 0;
|
||||
|
||||
auto * ctx_out = gguf_init_empty();
|
||||
// avoid overwriting existing output file
|
||||
if (std::ifstream(split_params.output.c_str())) {
|
||||
fprintf(stderr, "%s: output file %s already exists\n", __func__, split_params.output.c_str());
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
std::ofstream fout(split_params.output.c_str(), std::ios::binary);
|
||||
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
|
||||
|
||||
auto * ctx_out = gguf_init_empty();
|
||||
|
||||
std::vector<uint8_t> read_data;
|
||||
std::vector<ggml_context *> ctx_metas;
|
||||
std::vector<gguf_context *> ctx_ggufs;
|
||||
|
@ -1,3 +1,4 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
@ -121,7 +122,6 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
|
||||
llama_decode(ctx, bat);
|
||||
|
||||
llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1);
|
||||
llama_sampler_accept(smpl, token);
|
||||
|
||||
if (token == eos_token) {
|
||||
break;
|
||||
@ -154,11 +154,12 @@ static std::string gritlm_instruction(const std::string & instruction) {
|
||||
int main(int argc, char * argv[]) {
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
gpt_init();
|
||||
|
||||
llama_model_params mparams = llama_model_params_from_gpt_params(params);
|
||||
llama_context_params cparams = llama_context_params_from_gpt_params(params);
|
||||
|
||||
|
@ -1,4 +1,6 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cmath>
|
||||
@ -18,12 +20,12 @@
|
||||
#endif
|
||||
|
||||
static void print_usage(int, char ** argv) {
|
||||
LOG_TEE("\nexample usage:\n");
|
||||
LOG_TEE("\n %s \\\n"
|
||||
" -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
|
||||
LOG("\nexample usage:\n");
|
||||
LOG("\n %s \\\n"
|
||||
" -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n"
|
||||
" [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
|
||||
" [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
|
||||
LOG_TEE("\n");
|
||||
LOG("\n");
|
||||
}
|
||||
|
||||
struct Stats {
|
||||
@ -124,12 +126,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
e.counts.resize(src1->ne[0]*n_as, 0);
|
||||
}
|
||||
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
|
||||
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
|
||||
LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
|
||||
exit(1); //GGML_ABORT("fatal error");
|
||||
}
|
||||
if (m_params.verbosity > 1) {
|
||||
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
|
||||
}
|
||||
LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
|
||||
// loop over all possible experts, regardless if they are used or not in the batch
|
||||
for (int ex = 0; ex < n_as; ++ex) {
|
||||
size_t e_start = ex*src1->ne[0];
|
||||
@ -150,7 +150,8 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
e.values[e_start + j] += x[j]*x[j];
|
||||
e.counts[e_start + j]++;
|
||||
if (!std::isfinite(e.values[e_start + j])) {
|
||||
fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
|
||||
LOG("\n");
|
||||
LOG_ERR("%f detected in %s\n", e.values[e_start + j], wname.c_str());
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
@ -173,20 +174,18 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
||||
e.counts.resize(src1->ne[0], 0);
|
||||
}
|
||||
else if (e.values.size() != (size_t)src1->ne[0]) {
|
||||
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
|
||||
LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
|
||||
exit(1); //GGML_ABORT("fatal error");
|
||||
}
|
||||
++e.ncall;
|
||||
if (m_params.verbosity > 1) {
|
||||
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
|
||||
}
|
||||
LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
|
||||
for (int row = 0; row < (int)src1->ne[1]; ++row) {
|
||||
const float * x = data + row * src1->ne[0];
|
||||
for (int j = 0; j < (int)src1->ne[0]; ++j) {
|
||||
e.values[j] += x[j]*x[j];
|
||||
e.counts[j]++;
|
||||
if (!std::isfinite(e.values[j])) {
|
||||
fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
|
||||
LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str());
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
@ -238,17 +237,17 @@ void IMatrixCollector::save_imatrix(int ncall) const {
|
||||
}
|
||||
|
||||
if (n_zeros != 0 && is_first) {
|
||||
fprintf(stderr, "\n");
|
||||
LOG_INF("\n");
|
||||
is_first = false;
|
||||
}
|
||||
|
||||
if (n_zeros == n_all) {
|
||||
fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
|
||||
LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
|
||||
continue;
|
||||
}
|
||||
|
||||
if (n_zeros > 0) {
|
||||
fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
|
||||
LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -257,7 +256,7 @@ void IMatrixCollector::save_imatrix(int ncall) const {
|
||||
}
|
||||
|
||||
if (to_store.size() < m_stats.size()) {
|
||||
fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
|
||||
LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
|
||||
}
|
||||
|
||||
std::ofstream out(fname, std::ios::binary);
|
||||
@ -289,21 +288,20 @@ void IMatrixCollector::save_imatrix(int ncall) const {
|
||||
out.write(m_params.prompt_file.c_str(), len);
|
||||
}
|
||||
|
||||
if (m_params.verbosity > 0) {
|
||||
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
|
||||
}
|
||||
LOGV(1, "\n");
|
||||
LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
|
||||
}
|
||||
|
||||
bool IMatrixCollector::load_imatrix(const char * fname) {
|
||||
std::ifstream in(fname, std::ios::binary);
|
||||
if (!in) {
|
||||
printf("%s: failed to open %s\n",__func__, fname);
|
||||
LOG_ERR("%s: failed to open %s\n",__func__, fname);
|
||||
return false;
|
||||
}
|
||||
int n_entries;
|
||||
in.read((char*)&n_entries, sizeof(n_entries));
|
||||
if (in.fail() || n_entries < 1) {
|
||||
printf("%s: no data in file %s\n", __func__, fname);
|
||||
LOG_ERR("%s: no data in file %s\n", __func__, fname);
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < n_entries; ++i) {
|
||||
@ -311,7 +309,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
|
||||
std::vector<char> name_as_vec(len+1);
|
||||
in.read((char *)name_as_vec.data(), len);
|
||||
if (in.fail()) {
|
||||
printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
|
||||
LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
|
||||
return false;
|
||||
}
|
||||
name_as_vec[len] = 0;
|
||||
@ -322,7 +320,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
|
||||
int nval;
|
||||
in.read((char *)&nval, sizeof(nval));
|
||||
if (in.fail() || nval < 1) {
|
||||
printf("%s: failed reading number of values for entry %d\n",__func__,i);
|
||||
LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i);
|
||||
m_stats = {};
|
||||
return false;
|
||||
}
|
||||
@ -335,7 +333,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
|
||||
std::vector<float> tmp(nval);
|
||||
in.read((char*)tmp.data(), nval*sizeof(float));
|
||||
if (in.fail()) {
|
||||
printf("%s: failed reading data for entry %d\n",__func__,i);
|
||||
LOG_ERR("%s: failed reading data for entry %d\n",__func__,i);
|
||||
m_stats = {};
|
||||
return false;
|
||||
}
|
||||
@ -436,26 +434,25 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
auto tim1 = std::chrono::high_resolution_clock::now();
|
||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||
LOG_INF("%s: tokenizing the input ..\n", __func__);
|
||||
|
||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
auto tim2 = std::chrono::high_resolution_clock::now();
|
||||
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
||||
LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
||||
|
||||
if (params.i_chunk > 0) {
|
||||
if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
|
||||
fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
|
||||
LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
|
||||
return false;
|
||||
}
|
||||
fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
|
||||
LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
|
||||
tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
|
||||
}
|
||||
|
||||
if (int(tokens.size()) < 2*n_ctx) {
|
||||
fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
|
||||
n_ctx);
|
||||
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
|
||||
LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx);
|
||||
LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size());
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -477,7 +474,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
||||
double nll = 0.0;
|
||||
double nll2 = 0.0;
|
||||
|
||||
fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
|
||||
LOG_INF("%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
|
||||
|
||||
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
|
||||
|
||||
@ -513,7 +510,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
||||
|
||||
// TODO: use batch.logits to save computations instead of relying on logits_all == true
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -530,29 +527,29 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
||||
|
||||
if (i == 0) {
|
||||
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
|
||||
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
|
||||
LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
|
||||
int total_seconds = (int)(t_total * n_chunk);
|
||||
if (total_seconds >= 60*60) {
|
||||
fprintf(stderr, "%d hours ", total_seconds / (60*60));
|
||||
LOG("%d hours ", total_seconds / (60*60));
|
||||
total_seconds = total_seconds % (60*60);
|
||||
}
|
||||
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
|
||||
LOG("%.2f minutes\n", total_seconds / 60.0);
|
||||
}
|
||||
|
||||
if (params.compute_ppl) {
|
||||
const int first = n_ctx/2;
|
||||
const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
|
||||
const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
|
||||
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
|
||||
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
|
||||
count += n_ctx - first - 1;
|
||||
|
||||
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
|
||||
LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
|
||||
fflush(stdout);
|
||||
|
||||
logits.clear();
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
LOG("\n");
|
||||
|
||||
if (params.compute_ppl) {
|
||||
nll2 /= count;
|
||||
@ -561,9 +558,9 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
||||
nll2 -= nll * nll;
|
||||
if (nll2 > 0) {
|
||||
nll2 = sqrt(nll2/(count-1));
|
||||
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
|
||||
LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
|
||||
} else {
|
||||
printf("Unexpected negative standard deviation of log(prob)\n");
|
||||
LOG("Unexpected negative standard deviation of log(prob)\n");
|
||||
}
|
||||
}
|
||||
|
||||
@ -575,27 +572,27 @@ int main(int argc, char ** argv) {
|
||||
|
||||
params.n_ctx = 512;
|
||||
params.logits_all = true;
|
||||
params.verbosity = 1;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_IMATRIX, print_usage);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
gpt_init();
|
||||
|
||||
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
||||
|
||||
g_collector.set_params(params);
|
||||
|
||||
for (const auto & in_file : params.in_files) {
|
||||
printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
|
||||
LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
|
||||
if (!g_collector.load_imatrix(in_file.c_str())) {
|
||||
fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
|
||||
LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (params.in_files.size() > 1) {
|
||||
printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
|
||||
LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
|
||||
g_collector.save_imatrix();
|
||||
}
|
||||
|
||||
@ -614,20 +611,20 @@ int main(int argc, char ** argv) {
|
||||
llama_model * model = llama_init.model;
|
||||
llama_context * ctx = llama_init.context;
|
||||
if (model == nullptr || ctx == nullptr) {
|
||||
fprintf(stderr, "%s : failed to init\n", __func__);
|
||||
LOG_ERR("%s : failed to init\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const int n_ctx_train = llama_n_ctx_train(model);
|
||||
if (params.n_ctx > n_ctx_train) {
|
||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
|
||||
__func__, n_ctx_train, params.n_ctx);
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
||||
LOG_INF("\n");
|
||||
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
|
||||
}
|
||||
|
||||
if (!compute_imatrix(ctx, params)) {
|
||||
@ -636,8 +633,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
g_collector.save_imatrix();
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
LOG("\n");
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
@ -1,6 +1,8 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
|
||||
#include "console.h"
|
||||
#include "sampling.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cassert>
|
||||
@ -54,7 +56,7 @@ static void write_logfile(
|
||||
|
||||
const bool success = fs_create_directory_with_parents(params.logdir);
|
||||
if (!success) {
|
||||
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
||||
LOG_ERR("%s: warning: failed to create logdir %s, cannot write logfile\n",
|
||||
__func__, params.logdir.c_str());
|
||||
return;
|
||||
}
|
||||
@ -63,7 +65,7 @@ static void write_logfile(
|
||||
FILE * logfile = fopen(logfile_path.c_str(), "w");
|
||||
|
||||
if (logfile == NULL) {
|
||||
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
||||
LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
@ -92,7 +94,7 @@ static void sigint_handler(int signo) {
|
||||
is_interacting = true;
|
||||
} else {
|
||||
console::cleanup();
|
||||
printf("\n");
|
||||
LOG("\n");
|
||||
gpt_perf_print(*g_ctx, *g_smpl);
|
||||
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
|
||||
_exit(130);
|
||||
@ -105,63 +107,55 @@ int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
g_params = ¶ms;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_INFILL);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto & sparams = params.sparams;
|
||||
gpt_init();
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("infill", "log"));
|
||||
LOG_TEE("Log start\n");
|
||||
log_dump_cmdline(argc, argv);
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
auto & sparams = params.sparams;
|
||||
|
||||
console::init(params.simple_io, params.use_color);
|
||||
atexit([]() { console::cleanup(); });
|
||||
|
||||
if (params.logits_all) {
|
||||
printf("\n************\n");
|
||||
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
|
||||
printf("************\n\n");
|
||||
LOG_ERR("\n************\n");
|
||||
LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
|
||||
LOG_ERR("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.embedding) {
|
||||
printf("\n************\n");
|
||||
printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
|
||||
printf("************\n\n");
|
||||
LOG_ERR("\n************\n");
|
||||
LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
|
||||
LOG_ERR("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.n_ctx != 0 && params.n_ctx < 8) {
|
||||
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
||||
LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
|
||||
params.n_ctx = 8;
|
||||
}
|
||||
|
||||
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
|
||||
printf("\n************\n");
|
||||
printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
|
||||
printf("************\n\n");
|
||||
LOG_ERR("\n************\n");
|
||||
LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
|
||||
LOG_ERR("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.rope_freq_base != 0.0) {
|
||||
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
|
||||
LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
|
||||
}
|
||||
|
||||
if (params.rope_freq_scale != 0.0) {
|
||||
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
||||
LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
||||
}
|
||||
|
||||
print_build_info();
|
||||
|
||||
LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
|
||||
|
||||
LOG("%s: llama backend init\n", __func__);
|
||||
LOG_INF("%s: llama backend init\n", __func__);
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
@ -174,34 +168,32 @@ int main(int argc, char ** argv) {
|
||||
g_smpl = &smpl;
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||
llama_init_result llama_init = llama_init_from_gpt_params(params);
|
||||
|
||||
model = llama_init.model;
|
||||
ctx = llama_init.context;
|
||||
|
||||
if (model == NULL) {
|
||||
LOG_TEE("%s: error: unable to load model\n", __func__);
|
||||
LOG_ERR("%s: unable to load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const int n_ctx_train = llama_n_ctx_train(model);
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
LOG("n_ctx: %d\n", n_ctx);
|
||||
LOG_DBG("n_ctx: %d\n", n_ctx);
|
||||
|
||||
if (n_ctx > n_ctx_train) {
|
||||
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||
__func__, n_ctx_train, n_ctx);
|
||||
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
|
||||
LOG_INF("\n");
|
||||
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
|
||||
}
|
||||
const bool add_bos = llama_add_bos_token(model);
|
||||
GGML_ASSERT(!llama_add_eos_token(model));
|
||||
LOG("add_bos: %d\n", add_bos);
|
||||
|
||||
std::vector<llama_token> embd_inp;
|
||||
std::vector<llama_token> embd_end;
|
||||
@ -226,18 +218,19 @@ int main(int argc, char ** argv) {
|
||||
embd_inp.push_back(middle_token);
|
||||
}
|
||||
|
||||
LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
|
||||
LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
|
||||
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||
LOG_DBG("add_bos: %d\n", add_bos);
|
||||
LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
|
||||
LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
|
||||
LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
|
||||
|
||||
// Should not run without any tokens
|
||||
if (embd_inp.empty()) {
|
||||
embd_inp.push_back(llama_token_bos(model));
|
||||
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
|
||||
}
|
||||
|
||||
if ((int) embd_inp.size() > n_ctx - 4) {
|
||||
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||
LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -246,9 +239,8 @@ int main(int argc, char ** argv) {
|
||||
params.n_keep = (int)embd_inp.size();
|
||||
}
|
||||
|
||||
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
|
||||
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
|
||||
|
||||
LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
|
||||
LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());
|
||||
|
||||
// enable interactive mode if interactive start is specified
|
||||
if (params.interactive_first) {
|
||||
@ -256,21 +248,21 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (params.verbose_prompt) {
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
LOG_INF("\n");
|
||||
LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
|
||||
if (params.n_keep > 0) {
|
||||
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
|
||||
LOG_INF("%s: static prompt based on n_keep: '", __func__);
|
||||
for (int i = 0; i < params.n_keep; i++) {
|
||||
LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
LOG_TEE("'\n");
|
||||
LOG("'\n");
|
||||
}
|
||||
LOG_TEE("\n");
|
||||
LOG_INF("\n");
|
||||
}
|
||||
|
||||
if (params.interactive) {
|
||||
@ -287,25 +279,30 @@ int main(int argc, char ** argv) {
|
||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
#endif
|
||||
|
||||
LOG_TEE("%s: interactive mode on.\n", __func__);
|
||||
LOG_INF("%s: interactive mode on.\n", __func__);
|
||||
|
||||
if (params.input_prefix_bos) {
|
||||
LOG_TEE("Input prefix with BOS\n");
|
||||
LOG_INF("Input prefix with BOS\n");
|
||||
}
|
||||
|
||||
if (!params.input_prefix.empty()) {
|
||||
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
}
|
||||
|
||||
if (!params.input_suffix.empty()) {
|
||||
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
}
|
||||
}
|
||||
LOG_TEE("sampling: \n%s\n", sparams.print().c_str());
|
||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
LOG_TEE("\n\n");
|
||||
smpl = gpt_sampler_init(model, sparams);
|
||||
|
||||
LOG_TEE("\n##### Infill mode #####\n\n");
|
||||
LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
|
||||
LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
|
||||
LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
|
||||
|
||||
LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
|
||||
LOG("\n");
|
||||
LOG("\n##### Infill mode #####\n\n");
|
||||
if (params.interactive) {
|
||||
const char *control_message;
|
||||
if (params.multiline_input) {
|
||||
@ -316,11 +313,11 @@ int main(int argc, char ** argv) {
|
||||
" - To return control without starting a new line, end your input with '/'.\n"
|
||||
" - If you want to submit another line, end your input with '\\'.\n";
|
||||
}
|
||||
LOG_TEE("== Running in interactive mode. ==\n");
|
||||
LOG("== Running in interactive mode. ==\n");
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
|
||||
LOG( " - Press Ctrl+C to interject at any time.\n");
|
||||
#endif
|
||||
LOG_TEE( "%s\n", control_message);
|
||||
LOG( "%s\n", control_message);
|
||||
|
||||
is_interacting = params.interactive_first;
|
||||
}
|
||||
@ -340,8 +337,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
std::vector<llama_token> embd;
|
||||
|
||||
smpl = gpt_sampler_init(model, sparams);
|
||||
|
||||
while (n_remain != 0 || params.interactive) {
|
||||
// predict
|
||||
if (!embd.empty()) {
|
||||
@ -355,9 +350,8 @@ int main(int argc, char ** argv) {
|
||||
embd.resize(max_embd_size);
|
||||
|
||||
console::set_display(console::error);
|
||||
printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
||||
LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
||||
console::set_display(console::reset);
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
// infinite text generation via context swapping
|
||||
@ -366,14 +360,14 @@ int main(int argc, char ** argv) {
|
||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
||||
if (n_past + (int) embd.size() > n_ctx) {
|
||||
if (params.n_predict == -2) {
|
||||
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
||||
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
||||
break;
|
||||
}
|
||||
|
||||
const int n_left = n_past - params.n_keep - 1;
|
||||
const int n_discard = n_left/2;
|
||||
|
||||
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
@ -381,9 +375,9 @@ int main(int argc, char ** argv) {
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
LOG("after swap: n_past = %d\n", n_past);
|
||||
LOG_DBG("after swap: n_past = %d\n", n_past);
|
||||
|
||||
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
||||
LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
|
||||
|
||||
}
|
||||
|
||||
@ -395,16 +389,16 @@ int main(int argc, char ** argv) {
|
||||
n_eval = params.n_batch;
|
||||
}
|
||||
|
||||
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
||||
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
n_past += n_eval;
|
||||
|
||||
LOG("n_past = %d\n", n_past);
|
||||
LOG_DBG("n_past = %d\n", n_past);
|
||||
}
|
||||
|
||||
}
|
||||
@ -416,7 +410,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
gpt_sampler_accept(smpl, id, true);
|
||||
|
||||
// LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str());
|
||||
// LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
|
||||
|
||||
embd.push_back(id);
|
||||
|
||||
@ -426,10 +420,10 @@ int main(int argc, char ** argv) {
|
||||
// decrement remaining sampling budget
|
||||
--n_remain;
|
||||
|
||||
LOG("n_remain: %d\n", n_remain);
|
||||
LOG_DBG("n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
// some user input remains from prompt or interaction, forward it to processing
|
||||
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
||||
LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
||||
while ((int) embd_inp.size() > n_consumed) {
|
||||
embd.push_back(embd_inp[n_consumed]);
|
||||
|
||||
@ -448,7 +442,7 @@ int main(int argc, char ** argv) {
|
||||
if (input_echo) {
|
||||
for (auto id : embd) {
|
||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||
printf("%s", token_str.c_str());
|
||||
LOG("%s", token_str.c_str());
|
||||
|
||||
if (embd.size() > 1) {
|
||||
input_tokens.push_back(id);
|
||||
@ -457,7 +451,6 @@ int main(int argc, char ** argv) {
|
||||
output_ss << token_str;
|
||||
}
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
// reset color to default if we there is no pending user input
|
||||
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
||||
@ -470,10 +463,9 @@ int main(int argc, char ** argv) {
|
||||
if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
|
||||
if (is_interacting && !params.interactive_first) {
|
||||
// print an eot token
|
||||
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
||||
LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
||||
}
|
||||
fflush(stdout);
|
||||
printf("\n");
|
||||
LOG("\n");
|
||||
console::set_display(console::user_input);
|
||||
std::string buffer;
|
||||
std::string line;
|
||||
@ -529,35 +521,33 @@ int main(int argc, char ** argv) {
|
||||
n_remain = params.n_predict;
|
||||
n_past = 0;
|
||||
n_consumed = 0;
|
||||
// LOG_TEE("took new input\n");
|
||||
is_interacting = false;
|
||||
}
|
||||
// deal with end of generation tokens in interactive mode
|
||||
else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
|
||||
LOG("found EOS token\n");
|
||||
LOG_DBG("found EOS token\n");
|
||||
|
||||
if (params.interactive) {
|
||||
|
||||
is_interacting = true;
|
||||
printf("\n");
|
||||
LOG("\n");
|
||||
console::set_display(console::user_input);
|
||||
fflush(stdout);
|
||||
}
|
||||
}
|
||||
|
||||
if (n_past > 0 && is_interacting && !params.interactive) {
|
||||
LOG("waiting for user input\n");
|
||||
LOG_DBG("waiting for user input\n");
|
||||
|
||||
if (params.input_prefix_bos) {
|
||||
LOG("adding input prefix BOS token\n");
|
||||
LOG_DBG("adding input prefix BOS token\n");
|
||||
embd_inp.push_back(llama_token_bos(model));
|
||||
}
|
||||
|
||||
std::string buffer;
|
||||
if (!params.input_prefix.empty()) {
|
||||
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
buffer += params.input_prefix;
|
||||
printf("%s", buffer.c_str());
|
||||
LOG("%s", buffer.c_str());
|
||||
}
|
||||
|
||||
std::string line;
|
||||
@ -575,17 +565,17 @@ int main(int argc, char ** argv) {
|
||||
if (buffer.length() > 1) {
|
||||
// append input suffix if any
|
||||
if (!params.input_suffix.empty()) {
|
||||
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
buffer += params.input_suffix;
|
||||
printf("%s", params.input_suffix.c_str());
|
||||
LOG("%s", params.input_suffix.c_str());
|
||||
}
|
||||
|
||||
LOG("buffer: '%s'\n", buffer.c_str());
|
||||
LOG_DBG("buffer: '%s'\n", buffer.c_str());
|
||||
|
||||
const size_t original_size = embd_inp.size();
|
||||
|
||||
const auto line_inp = ::llama_tokenize(ctx, buffer, false);
|
||||
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
|
||||
LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
|
||||
|
||||
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
||||
|
||||
@ -596,9 +586,9 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
n_remain -= line_inp.size();
|
||||
LOG("n_remain: %d\n", n_remain);
|
||||
LOG_DBG("n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
LOG("empty line, passing control back\n");
|
||||
LOG_DBG("empty line, passing control back\n");
|
||||
}
|
||||
|
||||
input_echo = false; // do not echo this again
|
||||
@ -625,11 +615,10 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
if (!params.interactive && n_remain <= 0) {
|
||||
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
||||
fflush(stdout);
|
||||
LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG("\n");
|
||||
gpt_perf_print(ctx, smpl);
|
||||
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
|
||||
|
||||
@ -639,9 +628,5 @@ int main(int argc, char ** argv) {
|
||||
gpt_sampler_free(smpl);
|
||||
llama_backend_free();
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
LOG_TEE("Log end\n");
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) {
|
||||
fflush(p_err->fout);
|
||||
}
|
||||
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
llama_free(ctx);
|
||||
|
||||
|
@ -414,8 +414,6 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
|
||||
// sample the most likely token
|
||||
const auto new_token_id = llama_sampler_sample(sampler, context, -1);
|
||||
|
||||
llama_sampler_accept(sampler, new_token_id);
|
||||
|
||||
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
|
||||
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
|
||||
return nullptr;
|
||||
|
@ -152,8 +152,6 @@ actor LlamaContext {
|
||||
|
||||
new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)
|
||||
|
||||
llama_sampler_accept(sampling, new_token_id)
|
||||
|
||||
if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
|
||||
print("\n")
|
||||
is_done = true
|
||||
|
@ -39,7 +39,7 @@ python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B
|
||||
3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
|
||||
|
||||
```sh
|
||||
python ./examples/llava/convert_image_encoder_to_gguf \
|
||||
python ./examples/llava/convert_image_encoder_to_gguf.py \
|
||||
-m path/to/clip-vit-large-patch14-336 \
|
||||
--llava-projector path/to/MobileVLM-1.7B/llava.projector \
|
||||
--output-dir path/to/MobileVLM-1.7B \
|
||||
@ -47,7 +47,7 @@ python ./examples/llava/convert_image_encoder_to_gguf \
|
||||
```
|
||||
|
||||
```sh
|
||||
python ./examples/llava/convert_image_encoder_to_gguf \
|
||||
python ./examples/llava/convert_image_encoder_to_gguf.py \
|
||||
-m path/to/clip-vit-large-patch14-336 \
|
||||
--llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
|
||||
--output-dir path/to/MobileVLM-1.7B_V2 \
|
||||
@ -57,12 +57,12 @@ python ./examples/llava/convert_image_encoder_to_gguf \
|
||||
4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
|
||||
|
||||
```sh
|
||||
python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B
|
||||
python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B --skip-unknown
|
||||
```
|
||||
|
||||
5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
|
||||
5. Use `quantize` to convert LLaMA part's DataType from `fp32` to `q4_k`
|
||||
```sh
|
||||
./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
|
||||
./llama-quantize path/to/MobileVLM-1.7B/ggml-model-F32.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
|
||||
```
|
||||
|
||||
Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.
|
||||
|
@ -3,7 +3,6 @@
|
||||
// I'll gradually clean and extend it
|
||||
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
||||
#include "clip.h"
|
||||
#include "log.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
@ -40,6 +39,11 @@
|
||||
#include <cinttypes>
|
||||
#include <limits>
|
||||
|
||||
#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
|
||||
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
||||
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
||||
#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
||||
|
||||
//#define CLIP_DEBUG_FUNCTIONS
|
||||
|
||||
// RGB uint8 image
|
||||
@ -165,7 +169,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
static int get_key_idx(const gguf_context * ctx, const char * key) {
|
||||
int i = gguf_find_key(ctx, key);
|
||||
if (i == -1) {
|
||||
LOG_TEE("key %s not found in file\n", key);
|
||||
LOG_ERR("key %s not found in file\n", key);
|
||||
throw std::runtime_error(format("Missing required key: %s", key));
|
||||
}
|
||||
|
||||
@ -270,7 +274,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
|
||||
|
||||
static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
|
||||
size_t tensor_size = ggml_nbytes(tensor);
|
||||
LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
|
||||
LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
|
||||
prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
|
||||
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
|
||||
}
|
||||
@ -288,7 +292,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
|
||||
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
|
||||
std::ofstream file(filename, std::ios::binary);
|
||||
if (!file.is_open()) {
|
||||
LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
|
||||
LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
@ -307,7 +311,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
|
||||
static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
|
||||
std::ofstream file(filename, std::ios::binary);
|
||||
if (!file.is_open()) {
|
||||
LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
|
||||
LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
@ -568,7 +572,7 @@ struct clip_ctx {
|
||||
|
||||
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
|
||||
if (!ctx->has_vision_encoder) {
|
||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||
LOG_ERR("This gguf file seems to have no vision encoder\n");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@ -582,7 +586,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
if (load_image_size == nullptr) {
|
||||
load_image_size = clip_image_size_init();
|
||||
}
|
||||
LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
||||
LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
||||
image_size_width = load_image_size->width;
|
||||
image_size_height = load_image_size->height;
|
||||
if (is_inf) {
|
||||
@ -1047,21 +1051,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
const int idx_name = gguf_find_key(ctx, KEY_NAME);
|
||||
if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
|
||||
const std::string name = gguf_get_val_str(ctx, idx_name);
|
||||
LOG_TEE("%s: model name: %s\n", __func__, name.c_str());
|
||||
LOG_INF("%s: model name: %s\n", __func__, name.c_str());
|
||||
}
|
||||
LOG_TEE("%s: description: %s\n", __func__, description.c_str());
|
||||
LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
|
||||
LOG_TEE("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
|
||||
LOG_TEE("%s: n_tensors: %d\n", __func__, n_tensors);
|
||||
LOG_TEE("%s: n_kv: %d\n", __func__, n_kv);
|
||||
LOG_TEE("%s: ftype: %s\n", __func__, ftype_str.c_str());
|
||||
LOG_TEE("\n");
|
||||
LOG_INF("%s: description: %s\n", __func__, description.c_str());
|
||||
LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
|
||||
LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
|
||||
LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
|
||||
LOG_INF("%s: n_kv: %d\n", __func__, n_kv);
|
||||
LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str());
|
||||
LOG_INF("\n");
|
||||
}
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
|
||||
// kv
|
||||
const int n_kv = gguf_get_n_kv(ctx);
|
||||
LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
|
||||
LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
|
||||
__func__, n_kv, n_tensors, fname);
|
||||
{
|
||||
std::map<enum ggml_type, uint32_t> n_type;
|
||||
@ -1072,7 +1076,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
n_type[type]++;
|
||||
}
|
||||
|
||||
LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
|
||||
LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
|
||||
for (int i = 0; i < n_kv; i++) {
|
||||
const char * name = gguf_get_key(ctx, i);
|
||||
const enum gguf_type type = gguf_get_kv_type(ctx, i);
|
||||
@ -1088,7 +1092,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
}
|
||||
replace_all(value, "\n", "\\n");
|
||||
|
||||
LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
|
||||
LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
|
||||
}
|
||||
|
||||
// print type counts
|
||||
@ -1097,7 +1101,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
|
||||
LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1112,7 +1116,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
size_t tensor_size = ggml_nbytes(cur);
|
||||
model_size += tensor_size;
|
||||
if (verbosity >= 3) {
|
||||
LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
|
||||
LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
|
||||
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
|
||||
}
|
||||
}
|
||||
@ -1139,27 +1143,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
new_clip->backend = ggml_backend_cuda_init(0);
|
||||
LOG_TEE("%s: CLIP using CUDA backend\n", __func__);
|
||||
LOG_INF("%s: CLIP using CUDA backend\n", __func__);
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
new_clip->backend = ggml_backend_metal_init();
|
||||
LOG_TEE("%s: CLIP using Metal backend\n", __func__);
|
||||
LOG_INF("%s: CLIP using Metal backend\n", __func__);
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CANN
|
||||
new_clip->backend = ggml_backend_cann_init(0);
|
||||
LOG_TEE("%s: CLIP using CANN backend\n", __func__);
|
||||
LOG_INF("%s: CLIP using CANN backend\n", __func__);
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_VULKAN
|
||||
new_clip->backend = ggml_backend_vk_init(0);
|
||||
LOG_TEE("%s: CLIP using Vulkan backend\n", __func__);
|
||||
LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
|
||||
#endif
|
||||
|
||||
if (!new_clip->backend) {
|
||||
new_clip->backend = ggml_backend_cpu_init();
|
||||
LOG_TEE("%s: CLIP using CPU backend\n", __func__);
|
||||
LOG_INF("%s: CLIP using CPU backend\n", __func__);
|
||||
}
|
||||
|
||||
// model size and capabilities
|
||||
@ -1194,16 +1198,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
|
||||
|
||||
if (verbosity >= 1) {
|
||||
LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
||||
LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
||||
LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
||||
LOG_TEE("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
|
||||
LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
|
||||
LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
||||
LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
||||
LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
||||
LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
||||
LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
|
||||
LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
|
||||
LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
||||
}
|
||||
}
|
||||
|
||||
LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
|
||||
LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
|
||||
|
||||
// load tensors
|
||||
{
|
||||
@ -1216,7 +1220,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
|
||||
new_clip->ctx_data = ggml_init(params);
|
||||
if (!new_clip->ctx_data) {
|
||||
LOG_TEE("%s: ggml_init() failed\n", __func__);
|
||||
LOG_ERR("%s: ggml_init() failed\n", __func__);
|
||||
clip_free(new_clip);
|
||||
gguf_free(ctx);
|
||||
return nullptr;
|
||||
@ -1224,7 +1228,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
|
||||
auto fin = std::ifstream(fname, std::ios::binary);
|
||||
if (!fin) {
|
||||
LOG_TEE("cannot open model file for loading tensors\n");
|
||||
LOG_ERR("cannot open model file for loading tensors\n");
|
||||
clip_free(new_clip);
|
||||
gguf_free(ctx);
|
||||
return nullptr;
|
||||
@ -1246,7 +1250,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
|
||||
fin.seekg(offset, std::ios::beg);
|
||||
if (!fin) {
|
||||
LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name);
|
||||
LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name);
|
||||
clip_free(new_clip);
|
||||
gguf_free(ctx);
|
||||
return nullptr;
|
||||
@ -1317,23 +1321,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
}
|
||||
|
||||
if (verbosity >= 2) {
|
||||
LOG_TEE("\n%s: vision model hparams\n", __func__);
|
||||
LOG_TEE("image_size %d\n", hparams.image_size);
|
||||
LOG_TEE("patch_size %d\n", hparams.patch_size);
|
||||
LOG_TEE("v_hidden_size %d\n", hparams.hidden_size);
|
||||
LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate);
|
||||
LOG_TEE("v_projection_dim %d\n", hparams.projection_dim);
|
||||
LOG_TEE("v_n_head %d\n", hparams.n_head);
|
||||
LOG_TEE("v_n_layer %d\n", hparams.n_layer);
|
||||
LOG_TEE("v_eps %f\n", hparams.eps);
|
||||
LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
|
||||
LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
|
||||
LOG_TEE("v_image_grid_pinpoints: ");
|
||||
LOG_INF("\n%s: vision model hparams\n", __func__);
|
||||
LOG_INF("image_size %d\n", hparams.image_size);
|
||||
LOG_INF("patch_size %d\n", hparams.patch_size);
|
||||
LOG_INF("v_hidden_size %d\n", hparams.hidden_size);
|
||||
LOG_INF("v_n_intermediate %d\n", hparams.n_intermediate);
|
||||
LOG_INF("v_projection_dim %d\n", hparams.projection_dim);
|
||||
LOG_INF("v_n_head %d\n", hparams.n_head);
|
||||
LOG_INF("v_n_layer %d\n", hparams.n_layer);
|
||||
LOG_INF("v_eps %f\n", hparams.eps);
|
||||
LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
|
||||
LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
|
||||
LOG_INF("v_image_grid_pinpoints: ");
|
||||
for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
|
||||
LOG_TEE("%d ", hparams.image_grid_pinpoints[i]);
|
||||
LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
|
||||
}
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
|
||||
LOG_INF("\n");
|
||||
LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
|
||||
|
||||
}
|
||||
|
||||
@ -1371,7 +1375,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
|
||||
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
|
||||
} catch(const std::exception& /*e*/) {
|
||||
LOG_TEE("%s: failed to load vision model tensors\n", __func__);
|
||||
LOG_ERR("%s: failed to load vision model tensors\n", __func__);
|
||||
}
|
||||
|
||||
// LLaVA projection
|
||||
@ -1400,7 +1404,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
} catch (std::runtime_error & /*e*/) { }
|
||||
try {
|
||||
vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
|
||||
// LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
|
||||
// LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__);
|
||||
} catch (std::runtime_error & /*e*/) { }
|
||||
} else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
|
||||
// MobileVLM projection
|
||||
@ -1501,7 +1505,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
|
||||
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
|
||||
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
|
||||
LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
||||
LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
||||
}
|
||||
|
||||
return new_clip;
|
||||
@ -1552,7 +1556,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
|
||||
int nx, ny, nc;
|
||||
auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
|
||||
if (!data) {
|
||||
LOG_TEE("%s: failed to load image '%s'\n", __func__, fname);
|
||||
LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
|
||||
return false;
|
||||
}
|
||||
build_clip_img_from_data(data, nx, ny, img);
|
||||
@ -1564,7 +1568,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
|
||||
int nx, ny, nc;
|
||||
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
|
||||
if (!data) {
|
||||
LOG_TEE("%s: failed to decode image bytes\n", __func__);
|
||||
LOG_ERR("%s: failed to decode image bytes\n", __func__);
|
||||
return false;
|
||||
}
|
||||
build_clip_img_from_data(data, nx, ny, img);
|
||||
@ -1754,7 +1758,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
|
||||
int downscaled_height = static_cast<int>(original_height * scale);
|
||||
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
|
||||
int wasted_resolution = (width * height) - effective_resolution;
|
||||
// LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
|
||||
// LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
|
||||
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
|
||||
max_effective_resolution = effective_resolution;
|
||||
min_wasted_resolution = wasted_resolution;
|
||||
@ -1872,7 +1876,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
|
||||
const int multiple = fmin(ceil(ratio), max_slice_nums);
|
||||
|
||||
std::vector<std::vector<clip_image_u8 *>> images;
|
||||
LOG_TEE("%s: multiple %d\n", __func__, multiple);
|
||||
LOG_INF("%s: multiple %d\n", __func__, multiple);
|
||||
images.push_back(std::vector<clip_image_u8 *>());
|
||||
|
||||
if (multiple <= 1) {
|
||||
@ -1887,17 +1891,17 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
|
||||
clip_image_u8 * source_image = clip_image_u8_init();
|
||||
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
|
||||
// source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
|
||||
LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
|
||||
LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
|
||||
images[images.size()-1].push_back(source_image);
|
||||
|
||||
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
|
||||
LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
|
||||
LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
|
||||
|
||||
auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
|
||||
clip_image_u8 * refine_image = clip_image_u8_init();
|
||||
bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
|
||||
|
||||
LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
|
||||
LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
|
||||
|
||||
// split_to_patches
|
||||
int width = refine_image->nx;
|
||||
@ -1954,7 +1958,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
||||
int idx = 0;
|
||||
for (size_t i = 0; i < imgs.size(); ++i) {
|
||||
for (size_t j = 0; j < imgs[i].size(); ++j) {
|
||||
LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
|
||||
LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
|
||||
clip_image_f32 * res = clip_image_f32_init();
|
||||
normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
|
||||
res_imgs->data[idx++] = *res;
|
||||
@ -1966,7 +1970,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
||||
|
||||
bool pad_to_square = true;
|
||||
if (!ctx->has_vision_encoder) {
|
||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||
LOG_ERR("This gguf file seems to have no vision encoder\n");
|
||||
return false;
|
||||
}
|
||||
auto & params = ctx->vision_model.hparams;
|
||||
@ -2043,7 +2047,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < patches.size(); i++) {
|
||||
// LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
|
||||
// LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
|
||||
clip_image_u8_free(patches[i]);
|
||||
}
|
||||
|
||||
@ -2279,7 +2283,7 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
|
||||
|
||||
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
|
||||
if (!ctx->has_vision_encoder) {
|
||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||
LOG_ERR("This gguf file seems to have no vision encoder\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -2291,7 +2295,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
|
||||
|
||||
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
|
||||
if (!ctx->has_vision_encoder) {
|
||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||
LOG_ERR("This gguf file seems to have no vision encoder\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -2449,7 +2453,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
ggml_backend_graph_compute(ctx->backend, gf);
|
||||
|
||||
// the last node is the embedding tensor
|
||||
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
|
||||
struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
|
||||
|
||||
// copy the embeddings to the location passed by the user
|
||||
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
|
||||
@ -2521,7 +2525,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
||||
new_type = type;
|
||||
if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
|
||||
new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
|
||||
// LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
|
||||
// LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
|
||||
}
|
||||
const size_t n_elms = ggml_nelements(cur);
|
||||
float * f32_data;
|
||||
@ -2540,7 +2544,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
||||
f32_data = (float *)conv_buf.data();
|
||||
break;
|
||||
default:
|
||||
LOG_TEE("Please use an input file in f32 or f16\n");
|
||||
LOG_ERR("Please use an input file in f32 or f16\n");
|
||||
gguf_free(ctx_out);
|
||||
return false;
|
||||
}
|
||||
@ -2567,7 +2571,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
||||
fout.put(0);
|
||||
}
|
||||
|
||||
LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
|
||||
LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
|
||||
orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
||||
}
|
||||
|
||||
@ -2583,8 +2587,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
||||
gguf_free(ctx_out);
|
||||
|
||||
{
|
||||
LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
|
||||
LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
|
||||
LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
|
||||
LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
|
||||
}
|
||||
|
||||
return true;
|
||||
|
@ -1,14 +1,16 @@
|
||||
#include "ggml.h"
|
||||
#include "arg.h"
|
||||
#include "base64.hpp"
|
||||
#include "log.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "clip.h"
|
||||
#include "llava.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include "base64.hpp"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
||||
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
|
||||
@ -19,7 +21,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
|
||||
n_eval = n_batch;
|
||||
}
|
||||
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
|
||||
LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
||||
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
||||
return false;
|
||||
}
|
||||
*n_past += n_eval;
|
||||
@ -74,7 +76,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
|
||||
size_t img_base64_str_start, img_base64_str_end;
|
||||
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
|
||||
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
|
||||
LOG_TEE("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
|
||||
LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -88,7 +90,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
|
||||
|
||||
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
|
||||
if (!embed) {
|
||||
LOG_TEE("%s: could not load image from base64 string.\n", __func__);
|
||||
LOG_ERR("%s: could not load image from base64 string.\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -113,9 +115,9 @@ struct llava_context {
|
||||
};
|
||||
|
||||
static void print_usage(int, char ** argv) {
|
||||
LOG_TEE("\n example usage:\n");
|
||||
LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||
LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||
LOG("\n example usage:\n");
|
||||
LOG("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||
LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||
}
|
||||
|
||||
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
|
||||
@ -125,11 +127,11 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
|
||||
auto prompt = params->prompt;
|
||||
if (prompt_contains_image(prompt)) {
|
||||
if (!params->image.empty()) {
|
||||
LOG_TEE("using base64 encoded image instead of command line image path\n");
|
||||
LOG_INF("using base64 encoded image instead of command line image path\n");
|
||||
}
|
||||
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
|
||||
if (!embed) {
|
||||
LOG_TEE("%s: can't load image from prompt\n", __func__);
|
||||
LOG_ERR("%s: can't load image from prompt\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
params->prompt = remove_image_from_prompt(prompt);
|
||||
@ -155,18 +157,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
||||
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
|
||||
system_prompt = prompt.substr(0, image_pos);
|
||||
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
|
||||
LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
|
||||
LOG_INF("system_prompt: %s\n", system_prompt.c_str());
|
||||
if (params->verbose_prompt) {
|
||||
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
|
||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||
LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||
}
|
||||
}
|
||||
LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
|
||||
LOG_INF("user_prompt: %s\n", user_prompt.c_str());
|
||||
if (params->verbose_prompt) {
|
||||
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
|
||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||
LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@ -176,7 +178,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
||||
if (params->verbose_prompt) {
|
||||
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
|
||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||
LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -187,11 +189,11 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
||||
|
||||
// generate the response
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG("\n");
|
||||
|
||||
struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
|
||||
if (!smpl) {
|
||||
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
|
||||
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
@ -201,7 +203,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
||||
response += tmp;
|
||||
if (strcmp(tmp, "</s>") == 0) break;
|
||||
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||
printf("%s", tmp);
|
||||
LOG("%s", tmp);
|
||||
if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
|
||||
if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
|
||||
if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
|
||||
@ -210,7 +212,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
||||
}
|
||||
|
||||
gpt_sampler_free(smpl);
|
||||
printf("\n");
|
||||
LOG("\n");
|
||||
}
|
||||
|
||||
static struct llama_model * llava_init(gpt_params * params) {
|
||||
@ -221,7 +223,7 @@ static struct llama_model * llava_init(gpt_params * params) {
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
|
||||
if (model == NULL) {
|
||||
LOG_TEE("%s: error: unable to load model\n" , __func__);
|
||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||
return NULL;
|
||||
}
|
||||
return model;
|
||||
@ -244,11 +246,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
|
||||
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
|
||||
|
||||
if (ctx_llama == NULL) {
|
||||
LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
|
||||
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
|
||||
auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
|
||||
|
||||
ctx_llava->ctx_llama = ctx_llama;
|
||||
ctx_llava->ctx_clip = ctx_clip;
|
||||
@ -267,65 +269,54 @@ static void llava_free(struct llava_context * ctx_llava) {
|
||||
llama_backend_free();
|
||||
}
|
||||
|
||||
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
|
||||
(void) level;
|
||||
(void) user_data;
|
||||
LOG_TEE("%s", text);
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_time_init();
|
||||
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_LLAVA, print_usage);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("llava", "log"));
|
||||
LOG_TEE("Log start\n");
|
||||
log_dump_cmdline(argc, argv);
|
||||
llama_log_set(llama_log_callback_logTee, nullptr);
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
gpt_init();
|
||||
|
||||
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
auto model = llava_init(¶ms);
|
||||
|
||||
auto * model = llava_init(¶ms);
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (prompt_contains_image(params.prompt)) {
|
||||
auto ctx_llava = llava_init_context(¶ms, model);
|
||||
auto * ctx_llava = llava_init_context(¶ms, model);
|
||||
|
||||
auto image_embed = load_image(ctx_llava, ¶ms, "");
|
||||
auto * image_embed = load_image(ctx_llava, ¶ms, "");
|
||||
|
||||
// process the prompt
|
||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||
|
||||
llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx_llava->ctx_llama);
|
||||
llava_image_embed_free(image_embed);
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
} else {
|
||||
for (auto & image : params.image) {
|
||||
auto ctx_llava = llava_init_context(¶ms, model);
|
||||
auto * ctx_llava = llava_init_context(¶ms, model);
|
||||
|
||||
auto image_embed = load_image(ctx_llava, ¶ms, image);
|
||||
auto * image_embed = load_image(ctx_llava, ¶ms, image);
|
||||
if (!image_embed) {
|
||||
std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
|
||||
LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
// process the prompt
|
||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||
|
||||
llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx_llava->ctx_llama);
|
||||
llava_image_embed_free(image_embed);
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
|
@ -1,13 +1,23 @@
|
||||
#include "clip.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "llava.h"
|
||||
#include "base64.hpp"
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cerrno>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
#include <numeric>
|
||||
|
||||
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
|
||||
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
|
||||
|
||||
#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
|
||||
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
||||
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
||||
#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
|
||||
|
||||
// RGB uint8 image
|
||||
struct clip_image_u8 {
|
||||
@ -54,7 +64,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int>& ori
|
||||
int downscaled_height = static_cast<int>(original_height * scale);
|
||||
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
|
||||
int wasted_resolution = (width * height) - effective_resolution;
|
||||
// LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
|
||||
// LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
|
||||
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
|
||||
max_effective_resolution = effective_resolution;
|
||||
min_wasted_resolution = wasted_resolution;
|
||||
@ -184,7 +194,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
|
||||
// ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
|
||||
ggml_build_forward_expand(gf, flatten);
|
||||
ggml_graph_compute_with_ctx(model.ctx, gf, 1);
|
||||
struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];
|
||||
struct ggml_tensor* result = ggml_graph_node(gf, -1);
|
||||
|
||||
memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
|
||||
// append without newline tokens (default behavior in llava_arch when not using unpad ):
|
||||
@ -236,7 +246,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||
img_res_v.size = 0;
|
||||
img_res_v.data = nullptr;
|
||||
if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
|
||||
LOG_TEE("%s: unable to preprocess image\n", __func__);
|
||||
LOG_ERR("%s: unable to preprocess image\n", __func__);
|
||||
delete[] img_res_v.data;
|
||||
return false;
|
||||
}
|
||||
@ -265,14 +275,14 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
|
||||
}
|
||||
if (!encoded) {
|
||||
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
|
||||
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
|
||||
return false;
|
||||
}
|
||||
const int64_t t_img_enc_steop_batch_us = ggml_time_us();
|
||||
LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
|
||||
LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
|
||||
}
|
||||
const int64_t t_img_enc_batch_us = ggml_time_us();
|
||||
LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
||||
LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
||||
|
||||
int n_img_pos_out = 0;
|
||||
for (size_t i = 0; i < image_embd_v.size(); i++) {
|
||||
@ -287,7 +297,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||
load_image_size->width = img->nx;
|
||||
load_image_size->height = img->ny;
|
||||
clip_add_load_image_size(ctx_clip, load_image_size);
|
||||
LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
||||
LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
||||
}
|
||||
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
|
||||
// flat / default llava-1.5 type embedding
|
||||
@ -295,7 +305,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
|
||||
delete[] img_res_v.data;
|
||||
if (!encoded) {
|
||||
LOG_TEE("Unable to encode image\n");
|
||||
LOG_ERR("Unable to encode image\n");
|
||||
|
||||
return false;
|
||||
}
|
||||
@ -309,12 +319,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
|
||||
const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
|
||||
if (!encoded) {
|
||||
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
|
||||
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
const int64_t t_img_enc_batch_us = ggml_time_us();
|
||||
LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
||||
LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
||||
|
||||
const int32_t * image_grid = clip_image_grid(ctx_clip);
|
||||
|
||||
@ -347,12 +357,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||
// clip_image_save_to_bmp(*tmp, "image_feature.bmp");
|
||||
}
|
||||
|
||||
LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
|
||||
LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
|
||||
|
||||
const int64_t t_img_enc_end_us = ggml_time_us();
|
||||
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
|
||||
|
||||
LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
|
||||
LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -362,7 +372,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
|
||||
int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
|
||||
auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
|
||||
if (n_image_embd != n_llama_embd) {
|
||||
LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
|
||||
LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
@ -375,13 +385,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
|
||||
}
|
||||
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
|
||||
if (!image_embd) {
|
||||
LOG_TEE("Unable to allocate memory for image embeddings\n");
|
||||
LOG_ERR("Unable to allocate memory for image embeddings\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
int n_img_pos;
|
||||
if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
|
||||
LOG_TEE("%s: cannot encode image, aborting\n", __func__);
|
||||
LOG_ERR("%s: cannot encode image, aborting\n", __func__);
|
||||
free(image_embd);
|
||||
return false;
|
||||
}
|
||||
@ -401,7 +411,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
|
||||
}
|
||||
llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
|
||||
if (llama_decode(ctx_llama, batch)) {
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
*n_past += n_eval;
|
||||
@ -413,7 +423,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
|
||||
clip_image_u8 * img = clip_image_u8_init();
|
||||
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
|
||||
clip_image_u8_free(img);
|
||||
LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
|
||||
LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -422,7 +432,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
|
||||
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
|
||||
if (!image_embed_result) {
|
||||
clip_image_u8_free(img);
|
||||
LOG_TEE("%s: coulnd't embed the image\n", __func__);
|
||||
LOG_ERR("%s: coulnd't embed the image\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -436,7 +446,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
|
||||
static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
|
||||
auto file = fopen(path, "rb");
|
||||
if (file == NULL) {
|
||||
LOG_TEE("%s: can't read file %s\n", __func__, path);
|
||||
LOG_ERR("%s: can't read file %s\n", __func__, path);
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -446,7 +456,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
|
||||
|
||||
auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
|
||||
if (buffer == NULL) {
|
||||
LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
|
||||
LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
|
||||
perror("Memory allocation error");
|
||||
fclose(file);
|
||||
return false;
|
||||
@ -471,7 +481,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx
|
||||
long image_bytes_length;
|
||||
auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
|
||||
if (!loaded) {
|
||||
LOG_TEE("%s: failed to load %s\n", __func__, image_path);
|
||||
LOG_ERR("%s: failed to load %s\n", __func__, image_path);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -1,13 +1,18 @@
|
||||
#include "ggml.h"
|
||||
#include "arg.h"
|
||||
#include "log.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "clip.h"
|
||||
#include "llava.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
#include <iostream> // TODO: remove me
|
||||
|
||||
struct llava_context {
|
||||
struct clip_ctx * ctx_clip = NULL;
|
||||
@ -16,14 +21,8 @@ struct llava_context {
|
||||
};
|
||||
|
||||
static void show_additional_info(int /*argc*/, char ** argv) {
|
||||
LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||
LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||
}
|
||||
|
||||
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
|
||||
(void) level;
|
||||
(void) user_data;
|
||||
LOG_TEE("%s", text);
|
||||
LOG("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||
LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||
}
|
||||
|
||||
static struct llama_model * llava_init(gpt_params * params) {
|
||||
@ -34,7 +33,7 @@ static struct llama_model * llava_init(gpt_params * params) {
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
|
||||
if (model == NULL) {
|
||||
LOG_TEE("%s: error: unable to load model\n" , __func__);
|
||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||
return NULL;
|
||||
}
|
||||
return model;
|
||||
@ -49,7 +48,7 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
|
||||
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
|
||||
if (params->n_ctx < 2048) {
|
||||
// warn user here, "Image processing requires at least 2048 context, setting context to 2048"
|
||||
LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
|
||||
LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
|
||||
ctx_params.n_ctx = 2048;
|
||||
} else {
|
||||
ctx_params.n_ctx = params->n_ctx;
|
||||
@ -58,11 +57,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
|
||||
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
|
||||
|
||||
if (ctx_llama == NULL) {
|
||||
LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
|
||||
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
|
||||
auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
|
||||
|
||||
ctx_llava->ctx_llama = ctx_llama;
|
||||
ctx_llava->model = model;
|
||||
@ -87,7 +86,7 @@ static struct clip_ctx * clip_init_context(gpt_params * params) {
|
||||
if (prompt.empty()) {
|
||||
prompt = "describe the image in detail.";
|
||||
}
|
||||
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
|
||||
auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
|
||||
return ctx_clip;
|
||||
}
|
||||
|
||||
@ -99,7 +98,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
|
||||
n_eval = n_batch;
|
||||
}
|
||||
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
|
||||
LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
||||
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
||||
return false;
|
||||
}
|
||||
*n_past += n_eval;
|
||||
@ -123,7 +122,7 @@ static void process_eval_image_embed(struct llava_context * ctx_llava, const str
|
||||
float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
|
||||
std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
|
||||
|
||||
auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
|
||||
auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
|
||||
slice_embed->embed = image_embed;
|
||||
slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
|
||||
llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
|
||||
@ -141,7 +140,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
|
||||
else if (has_minicpmv_projector == 3) {
|
||||
system_prompt = "<|im_start|>user\n";
|
||||
}
|
||||
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
|
||||
LOG_INF("%s: image token past: %d\n", __func__, n_past);
|
||||
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
|
||||
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
|
||||
@ -160,7 +159,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
|
||||
}
|
||||
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
|
||||
}
|
||||
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
|
||||
LOG_INF("%s: image token past: %d\n", __func__, n_past);
|
||||
}
|
||||
|
||||
static const char * sample(struct gpt_sampler * smpl,
|
||||
@ -179,42 +178,42 @@ static const char * sample(struct gpt_sampler * smpl,
|
||||
}
|
||||
|
||||
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
|
||||
auto ctx_clip = clip_init_context(params);
|
||||
auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
|
||||
auto * ctx_clip = clip_init_context(params);
|
||||
auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
|
||||
if (!embeds) {
|
||||
std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
|
||||
LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str());
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// process the prompt
|
||||
if (params->prompt.empty() && params->interactive == false) {
|
||||
LOG_TEE("prompt should be given or interactive mode should be on");
|
||||
LOG_ERR("prompt should be given or interactive mode should be on");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
auto model = llava_init(params);
|
||||
auto * model = llava_init(params);
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
const int64_t t_llava_init_start_us = ggml_time_us();
|
||||
auto ctx_llava = llava_init_context(params, model);
|
||||
auto * ctx_llava = llava_init_context(params, model);
|
||||
ctx_llava->ctx_clip = ctx_clip;
|
||||
const int64_t t_llava_init_end_us = ggml_time_us();
|
||||
float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
|
||||
LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
|
||||
LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
|
||||
|
||||
const int64_t t_process_image_start_us = ggml_time_us();
|
||||
process_image(ctx_llava, embeds, params, n_past);
|
||||
const int64_t t_process_image_end_us = ggml_time_us();
|
||||
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
|
||||
LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
|
||||
LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
|
||||
|
||||
llava_image_embed_free(embeds);
|
||||
return ctx_llava;
|
||||
}
|
||||
|
||||
static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
|
||||
static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, const std::string & prompt, int & n_past, bool is_first = false){
|
||||
std::string user_prompt = prompt;
|
||||
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
|
||||
if (!is_first) {
|
||||
@ -236,7 +235,7 @@ static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_par
|
||||
|
||||
// generate the response
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_INF("\n");
|
||||
|
||||
struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
|
||||
return smpl;
|
||||
@ -253,17 +252,11 @@ int main(int argc, char ** argv) {
|
||||
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, show_additional_info);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("llava", "log"));
|
||||
LOG_TEE("Log start\n");
|
||||
log_dump_cmdline(argc, argv);
|
||||
llama_log_set(llama_log_callback_logTee, nullptr);
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
gpt_init();
|
||||
|
||||
if (params.mmproj.empty() || (params.image.empty())) {
|
||||
show_additional_info(argc, argv);
|
||||
@ -272,21 +265,23 @@ int main(int argc, char ** argv) {
|
||||
|
||||
for (auto & image : params.image) {
|
||||
int n_past = 0;
|
||||
auto ctx_llava = minicpmv_init(¶ms, image, n_past);
|
||||
auto * ctx_llava = minicpmv_init(¶ms, image, n_past);
|
||||
|
||||
if (!params.prompt.empty()) {
|
||||
LOG_TEE("<user>%s\n", params.prompt.c_str());
|
||||
LOG_TEE("<assistant>");
|
||||
auto smpl = llama_init(ctx_llava, ¶ms, params.prompt.c_str(), n_past, true);
|
||||
LOG("<user>%s\n", params.prompt.c_str());
|
||||
LOG("<assistant>");
|
||||
auto * smpl = llama_init(ctx_llava, ¶ms, params.prompt, n_past, true);
|
||||
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
|
||||
std::string response = "";
|
||||
std::string response;
|
||||
bool have_tmp = false;
|
||||
for (int i = 0; i < max_tgt_len; i++) {
|
||||
auto tmp = llama_loop(ctx_llava, smpl, n_past);
|
||||
const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
|
||||
response += tmp;
|
||||
if (strcmp(tmp, "</s>") == 0){
|
||||
if(!have_tmp)continue;
|
||||
else break;
|
||||
if (!have_tmp) {
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||
have_tmp = true;
|
||||
@ -298,15 +293,15 @@ int main(int argc, char ** argv) {
|
||||
gpt_sampler_free(smpl);
|
||||
}else {
|
||||
while (true) {
|
||||
LOG_TEE("<user>");
|
||||
LOG("<user>");
|
||||
std::string prompt;
|
||||
std::getline(std::cin, prompt);
|
||||
LOG_TEE("<assistant>");
|
||||
auto smpl = llama_init(ctx_llava, ¶ms, prompt, n_past, true);
|
||||
LOG("<assistant>");
|
||||
auto * smpl = llama_init(ctx_llava, ¶ms, prompt, n_past, true);
|
||||
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
|
||||
std::string response = "";
|
||||
std::string response;
|
||||
for (int i = 0; i < max_tgt_len; i++) {
|
||||
auto tmp = llama_loop(ctx_llava, smpl, n_past);
|
||||
const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
|
||||
response += tmp;
|
||||
if (strcmp(tmp, "</s>") == 0) break;
|
||||
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||
@ -318,7 +313,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx_llava->ctx_llama);
|
||||
|
||||
ctx_llava->model = NULL;
|
||||
llava_free(ctx_llava);
|
||||
|
@ -1,4 +1,7 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdio>
|
||||
@ -36,23 +39,18 @@ struct ngram_container {
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
gpt_init();
|
||||
|
||||
const int W = 15; // lookahead window
|
||||
const int N = 5; // n-gram size
|
||||
const int G = 15; // max verification n-grams
|
||||
|
||||
const bool dump_kv_cache = params.dump_kv_cache;
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("lookahead", "log"));
|
||||
LOG_TEE("Log start\n");
|
||||
log_dump_cmdline(argc, argv);
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
// init llama.cpp
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
@ -74,14 +72,14 @@ int main(int argc, char ** argv) {
|
||||
const int max_tokens_list_size = max_context_size - 4;
|
||||
|
||||
if ((int) inp.size() > max_tokens_list_size) {
|
||||
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
|
||||
LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
LOG("\n\n");
|
||||
|
||||
for (auto id : inp) {
|
||||
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
|
||||
LOG("%s", llama_token_to_piece(ctx, id).c_str());
|
||||
}
|
||||
|
||||
fflush(stderr);
|
||||
@ -165,7 +163,7 @@ int main(int argc, char ** argv) {
|
||||
{
|
||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||
|
||||
printf("%s", token_str.c_str());
|
||||
LOG("%s", token_str.c_str());
|
||||
fflush(stdout);
|
||||
}
|
||||
}
|
||||
@ -255,7 +253,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch) != 0) {
|
||||
fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__);
|
||||
LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -292,10 +290,10 @@ int main(int argc, char ** argv) {
|
||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||
|
||||
if (v == 0) {
|
||||
printf("%s", token_str.c_str());
|
||||
LOG("%s", token_str.c_str());
|
||||
} else {
|
||||
// print light cyan
|
||||
printf("\033[0;96m%s\033[0m", token_str.c_str());
|
||||
LOG("\033[0;96m%s\033[0m", token_str.c_str());
|
||||
}
|
||||
fflush(stdout);
|
||||
|
||||
@ -329,21 +327,21 @@ int main(int argc, char ** argv) {
|
||||
// print known n-grams starting with token id (debug)
|
||||
if (0 && v == 0) {
|
||||
if (ngrams_observed.cnt[id] > 0) {
|
||||
printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
|
||||
LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
|
||||
}
|
||||
|
||||
for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
|
||||
printf(" - ngram %2d: ", i);
|
||||
LOG(" - ngram %2d: ", i);
|
||||
|
||||
const int idx = id*(N - 1)*G + i*(N - 1);
|
||||
|
||||
for (int j = 0; j < N - 1; j++) {
|
||||
const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
|
||||
|
||||
printf("%s", token_str.c_str());
|
||||
LOG("%s", token_str.c_str());
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
LOG("\n");
|
||||
}
|
||||
}
|
||||
|
||||
@ -454,20 +452,20 @@ int main(int argc, char ** argv) {
|
||||
|
||||
auto t_dec_end = ggml_time_us();
|
||||
|
||||
LOG_TEE("\n\n");
|
||||
LOG("\n\n");
|
||||
|
||||
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
|
||||
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
|
||||
LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
|
||||
LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("W = %2d\n", W);
|
||||
LOG_TEE("N = %2d\n", N);
|
||||
LOG_TEE("G = %2d\n", G);
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("n_predict = %d\n", n_predict);
|
||||
LOG_TEE("n_accept = %d\n", n_accept);
|
||||
LOG_INF("\n");
|
||||
LOG_INF("W = %2d\n", W);
|
||||
LOG_INF("N = %2d\n", N);
|
||||
LOG_INF("G = %2d\n", G);
|
||||
LOG_INF("\n");
|
||||
LOG_INF("n_predict = %d\n", n_predict);
|
||||
LOG_INF("n_accept = %d\n", n_accept);
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_INF("\n");
|
||||
gpt_perf_print(ctx, smpl);
|
||||
|
||||
gpt_sampler_free(smpl);
|
||||
@ -481,7 +479,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
LOG("\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -1,7 +1,8 @@
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "ngram-cache.h"
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <fstream>
|
||||
@ -13,8 +14,7 @@
|
||||
int main(int argc, char ** argv){
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -40,4 +40,6 @@ int main(int argc, char ** argv){
|
||||
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
|
||||
|
||||
llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -1,25 +1,26 @@
|
||||
#include "ggml.h"
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "log.h"
|
||||
#include "ngram-cache.h"
|
||||
#include "llama.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cinttypes>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
|
||||
int main(int argc, char ** argv){
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
gpt_init();
|
||||
|
||||
const int n_draft = params.n_draft;
|
||||
|
||||
// init llama.cpp
|
||||
@ -49,7 +50,7 @@ int main(int argc, char ** argv){
|
||||
try {
|
||||
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
|
||||
} catch (std::ifstream::failure const &) {
|
||||
fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
|
||||
LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
@ -128,7 +129,7 @@ int main(int argc, char ** argv){
|
||||
const int64_t eta_min = eta_ms / (60*1000);
|
||||
const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000;
|
||||
|
||||
LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
|
||||
LOG_INF("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
|
||||
}
|
||||
|
||||
// After each chunk, update the dynamic ngram cache with the context ngram cache:
|
||||
@ -136,24 +137,24 @@ int main(int argc, char ** argv){
|
||||
ngram_cache_context.clear();
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG("\n");
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("n_draft = %d\n", n_draft);
|
||||
LOG_TEE("n_predict = %d\n", n_input - n_input % n_ctx);
|
||||
LOG_TEE("n_drafted = %d\n", n_drafted);
|
||||
LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
|
||||
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
|
||||
LOG_INF("\n");
|
||||
LOG_INF("n_draft = %d\n", n_draft);
|
||||
LOG_INF("n_predict = %d\n", n_input - n_input % n_ctx);
|
||||
LOG_INF("n_drafted = %d\n", n_drafted);
|
||||
LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
|
||||
LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
|
||||
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
|
||||
LOG_TEE("n_accept = %d\n", n_accept);
|
||||
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||
LOG_INF("n_accept = %d\n", n_accept);
|
||||
LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
LOG("\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -1,7 +1,10 @@
|
||||
#include "arg.h"
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "ngram-cache.h"
|
||||
#include "sampling.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
@ -12,22 +15,17 @@
|
||||
int main(int argc, char ** argv){
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
gpt_init();
|
||||
|
||||
// max. number of additional tokens to draft if match is found
|
||||
const int n_draft = params.n_draft;
|
||||
|
||||
const bool dump_kv_cache = params.dump_kv_cache;
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("lookup", "log"));
|
||||
LOG_TEE("Log start\n");
|
||||
log_dump_cmdline(argc, argv);
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
// init llama.cpp
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
@ -57,7 +55,7 @@ int main(int argc, char ** argv){
|
||||
try {
|
||||
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
|
||||
} catch (std::ifstream::failure const &) {
|
||||
fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
|
||||
LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
@ -75,14 +73,14 @@ int main(int argc, char ** argv){
|
||||
const int max_tokens_list_size = max_context_size - 4;
|
||||
|
||||
if ((int) inp.size() > max_tokens_list_size) {
|
||||
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
|
||||
LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
LOG("\n\n");
|
||||
|
||||
for (auto id : inp) {
|
||||
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
|
||||
LOG("%s", llama_token_to_piece(ctx, id).c_str());
|
||||
}
|
||||
|
||||
fflush(stderr);
|
||||
@ -123,7 +121,7 @@ int main(int argc, char ** argv){
|
||||
}
|
||||
|
||||
// print current draft sequence
|
||||
LOG("drafted %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, draft).c_str());
|
||||
LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
|
||||
|
||||
int i_dft = 0;
|
||||
while (true) {
|
||||
@ -135,7 +133,7 @@ int main(int argc, char ** argv){
|
||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||
|
||||
if (!params.use_color) {
|
||||
printf("%s", token_str.c_str());
|
||||
LOG("%s", token_str.c_str());
|
||||
}
|
||||
|
||||
if (llama_token_is_eog(model, id)) {
|
||||
@ -146,7 +144,7 @@ int main(int argc, char ** argv){
|
||||
|
||||
// check if the target token matches the draft
|
||||
if (i_dft < (int) draft.size() && id == draft[i_dft]) {
|
||||
LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
|
||||
LOG_DBG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
|
||||
++n_accept;
|
||||
++n_past;
|
||||
++i_dft;
|
||||
@ -160,19 +158,19 @@ int main(int argc, char ** argv){
|
||||
|
||||
if (params.use_color) {
|
||||
// color accepted draft token
|
||||
printf("\033[34m%s\033[0m", token_str.c_str());
|
||||
LOG("\033[34m%s\033[0m", token_str.c_str());
|
||||
fflush(stdout);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (params.use_color) {
|
||||
printf("%s", token_str.c_str());
|
||||
LOG("%s", token_str.c_str());
|
||||
}
|
||||
fflush(stdout);
|
||||
|
||||
|
||||
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
|
||||
LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
|
||||
|
||||
draft.clear();
|
||||
draft.push_back(id);
|
||||
@ -223,24 +221,23 @@ int main(int argc, char ** argv){
|
||||
llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
|
||||
llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
|
||||
|
||||
LOG_TEE("\n\n");
|
||||
LOG("\n\n");
|
||||
|
||||
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
|
||||
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
|
||||
LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
|
||||
LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("n_draft = %d\n", n_draft);
|
||||
LOG_TEE("n_predict = %d\n", n_predict);
|
||||
LOG_TEE("n_drafted = %d\n", n_drafted);
|
||||
LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
|
||||
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
|
||||
LOG_INF("\n");
|
||||
LOG_INF("n_draft = %d\n", n_draft);
|
||||
LOG_INF("n_predict = %d\n", n_predict);
|
||||
LOG_INF("n_drafted = %d\n", n_drafted);
|
||||
LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
|
||||
LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
|
||||
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
|
||||
LOG_TEE("n_accept = %d\n", n_accept);
|
||||
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||
LOG_INF("n_accept = %d\n", n_accept);
|
||||
LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||
|
||||
LOG_TEE("\ntarget:\n\n");
|
||||
llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
LOG_INF("\ntarget:\n\n");
|
||||
gpt_perf_print(ctx, smpl);
|
||||
|
||||
gpt_sampler_free(smpl);
|
||||
|
||||
@ -251,7 +248,7 @@ int main(int argc, char ** argv){
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
LOG("\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -161,6 +161,8 @@ A value of -1 will enable infinite text generation, even though we have a finite
|
||||
|
||||
If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled.
|
||||
|
||||
The `--no-context-shift` option allows you to stop the infinite text generation once the finite context window is full.
|
||||
|
||||
It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode, text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `--predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.
|
||||
|
||||
### Temperature
|
||||
|
@ -1,11 +1,11 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
|
||||
#include "console.h"
|
||||
#include "log.h"
|
||||
#include "sampling.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
@ -41,11 +41,13 @@ static std::vector<llama_token> * g_output_tokens;
|
||||
static bool is_interacting = false;
|
||||
static bool need_insert_eot = false;
|
||||
|
||||
static void print_usage(int, char ** argv) {
|
||||
printf("\nexample usage:\n");
|
||||
printf("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
|
||||
printf("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
|
||||
printf("\n");
|
||||
static void print_usage(int argc, char ** argv) {
|
||||
(void) argc;
|
||||
|
||||
LOG("\nexample usage:\n");
|
||||
LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
|
||||
LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
|
||||
LOG("\n");
|
||||
}
|
||||
|
||||
static bool file_exists(const std::string & path) {
|
||||
@ -73,8 +75,7 @@ static void write_logfile(
|
||||
|
||||
const bool success = fs_create_directory_with_parents(params.logdir);
|
||||
if (!success) {
|
||||
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
||||
__func__, params.logdir.c_str());
|
||||
LOG_ERR("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
@ -82,7 +83,7 @@ static void write_logfile(
|
||||
FILE * logfile = fopen(logfile_path.c_str(), "w");
|
||||
|
||||
if (logfile == NULL) {
|
||||
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
||||
LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
@ -112,7 +113,7 @@ static void sigint_handler(int signo) {
|
||||
need_insert_eot = true;
|
||||
} else {
|
||||
console::cleanup();
|
||||
printf("\n");
|
||||
LOG("\n");
|
||||
gpt_perf_print(*g_ctx, *g_smpl);
|
||||
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
|
||||
_exit(130);
|
||||
@ -121,80 +122,61 @@ static void sigint_handler(int signo) {
|
||||
}
|
||||
#endif
|
||||
|
||||
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
|
||||
(void) level;
|
||||
(void) user_data;
|
||||
LOG_TEE("%s", text);
|
||||
}
|
||||
|
||||
static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
|
||||
static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
|
||||
llama_chat_msg new_msg{role, content};
|
||||
auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
|
||||
chat_msgs.push_back({role, content});
|
||||
LOG("formatted: %s\n", formatted.c_str());
|
||||
LOG_DBG("formatted: '%s'\n", formatted.c_str());
|
||||
return formatted;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
g_params = ¶ms;
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_MAIN, print_usage);
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
gpt_init();
|
||||
|
||||
auto & sparams = params.sparams;
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("main", "log"));
|
||||
LOG_TEE("Log start\n");
|
||||
log_dump_cmdline(argc, argv);
|
||||
llama_log_set(llama_log_callback_logTee, nullptr);
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
// TODO: Dump params ?
|
||||
//LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
|
||||
|
||||
// save choice to use color for later
|
||||
// (note for later: this is a slightly awkward choice)
|
||||
console::init(params.simple_io, params.use_color);
|
||||
atexit([]() { console::cleanup(); });
|
||||
|
||||
if (params.logits_all) {
|
||||
printf("\n************\n");
|
||||
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
|
||||
printf("************\n\n");
|
||||
LOG_ERR("************\n");
|
||||
LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
|
||||
LOG_ERR("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.embedding) {
|
||||
printf("\n************\n");
|
||||
printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
|
||||
printf("************\n\n");
|
||||
LOG_ERR("************\n");
|
||||
LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
|
||||
LOG_ERR("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.n_ctx != 0 && params.n_ctx < 8) {
|
||||
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
||||
LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
||||
params.n_ctx = 8;
|
||||
}
|
||||
|
||||
if (params.rope_freq_base != 0.0) {
|
||||
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
|
||||
LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
|
||||
}
|
||||
|
||||
if (params.rope_freq_scale != 0.0) {
|
||||
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
||||
LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
||||
}
|
||||
|
||||
print_build_info();
|
||||
LOG_INF("%s: llama backend init\n", __func__);
|
||||
|
||||
LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
|
||||
|
||||
LOG("%s: llama backend init\n", __func__);
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
@ -209,21 +191,19 @@ int main(int argc, char ** argv) {
|
||||
g_smpl = &smpl;
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||
llama_init_result llama_init = llama_init_from_gpt_params(params);
|
||||
|
||||
model = llama_init.model;
|
||||
ctx = llama_init.context;
|
||||
|
||||
if (model == NULL) {
|
||||
LOG_TEE("%s: error: unable to load model\n", __func__);
|
||||
LOG_ERR("%s: error: unable to load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
LOG("%s: llama threadpool init = n_threads = %d\n",
|
||||
__func__,
|
||||
(int) params.cpuparams.n_threads
|
||||
);
|
||||
LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
|
||||
|
||||
struct ggml_threadpool_params tpp_batch =
|
||||
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
|
||||
struct ggml_threadpool_params tpp =
|
||||
@ -235,8 +215,8 @@ int main(int argc, char ** argv) {
|
||||
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
|
||||
threadpool_batch = ggml_threadpool_new(&tpp_batch);
|
||||
if (!threadpool_batch) {
|
||||
LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
|
||||
exit(1);
|
||||
LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Start the non-batch threadpool in the paused state
|
||||
@ -245,55 +225,54 @@ int main(int argc, char ** argv) {
|
||||
|
||||
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
|
||||
if (!threadpool) {
|
||||
LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
|
||||
exit(1);
|
||||
LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_attach_threadpool(ctx, threadpool, threadpool_batch);
|
||||
|
||||
const int n_ctx_train = llama_n_ctx_train(model);
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
LOG("n_ctx: %d\n", n_ctx);
|
||||
|
||||
if (n_ctx > n_ctx_train) {
|
||||
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||
__func__, n_ctx_train, n_ctx);
|
||||
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
|
||||
}
|
||||
|
||||
// print chat template example in conversation mode
|
||||
if (params.conversation) {
|
||||
if (params.enable_chat_template) {
|
||||
LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
|
||||
LOG_INF("%s: chat template example:\n%s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
|
||||
} else {
|
||||
LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
|
||||
LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
|
||||
LOG_INF("\n");
|
||||
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
|
||||
LOG_INF("\n");
|
||||
}
|
||||
|
||||
std::string path_session = params.path_prompt_cache;
|
||||
std::vector<llama_token> session_tokens;
|
||||
|
||||
if (!path_session.empty()) {
|
||||
LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
|
||||
LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
|
||||
if (!file_exists(path_session)) {
|
||||
LOG_TEE("%s: session file does not exist, will create.\n", __func__);
|
||||
LOG_INF("%s: session file does not exist, will create.\n", __func__);
|
||||
} else if (file_is_empty(path_session)) {
|
||||
LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__);
|
||||
LOG_INF("%s: The session file is empty. A new session will be initialized.\n", __func__);
|
||||
} else {
|
||||
// The file exists and is not empty
|
||||
session_tokens.resize(n_ctx);
|
||||
size_t n_token_count_out = 0;
|
||||
if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
|
||||
LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
|
||||
LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str());
|
||||
return 1;
|
||||
}
|
||||
session_tokens.resize(n_token_count_out);
|
||||
LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
|
||||
LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
|
||||
}
|
||||
}
|
||||
|
||||
@ -301,7 +280,8 @@ int main(int argc, char ** argv) {
|
||||
if (!llama_model_has_encoder(model)) {
|
||||
GGML_ASSERT(!llama_add_eos_token(model));
|
||||
}
|
||||
LOG("add_bos: %d\n", add_bos);
|
||||
|
||||
LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos);
|
||||
|
||||
std::vector<llama_token> embd_inp;
|
||||
|
||||
@ -310,31 +290,31 @@ int main(int argc, char ** argv) {
|
||||
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
|
||||
: params.prompt;
|
||||
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
|
||||
LOG("tokenize the prompt\n");
|
||||
LOG_DBG("tokenize the prompt\n");
|
||||
embd_inp = ::llama_tokenize(ctx, prompt, true, true);
|
||||
} else {
|
||||
LOG("use session tokens\n");
|
||||
LOG_DBG("use session tokens\n");
|
||||
embd_inp = session_tokens;
|
||||
}
|
||||
|
||||
LOG("prompt: \"%s\"\n", log_tostr(prompt));
|
||||
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||
LOG_DBG("prompt: \"%s\"\n", prompt.c_str());
|
||||
LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
|
||||
}
|
||||
|
||||
// Should not run without any tokens
|
||||
if (embd_inp.empty()) {
|
||||
if (add_bos) {
|
||||
embd_inp.push_back(llama_token_bos(model));
|
||||
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
|
||||
} else {
|
||||
LOG_TEE("error: input is empty\n");
|
||||
LOG_ERR("input is empty\n");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Tokenize negative prompt
|
||||
if ((int) embd_inp.size() > n_ctx - 4) {
|
||||
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||
LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -348,14 +328,14 @@ int main(int argc, char ** argv) {
|
||||
n_matching_session_tokens++;
|
||||
}
|
||||
if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
|
||||
LOG_TEE("%s: using full prompt from session file\n", __func__);
|
||||
LOG_INF("%s: using full prompt from session file\n", __func__);
|
||||
} else if (n_matching_session_tokens >= embd_inp.size()) {
|
||||
LOG_TEE("%s: session file has exact match for prompt!\n", __func__);
|
||||
LOG_INF("%s: session file has exact match for prompt!\n", __func__);
|
||||
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
|
||||
LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
|
||||
LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
|
||||
__func__, n_matching_session_tokens, embd_inp.size());
|
||||
} else {
|
||||
LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
|
||||
LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n",
|
||||
__func__, n_matching_session_tokens, embd_inp.size());
|
||||
}
|
||||
|
||||
@ -363,14 +343,13 @@ int main(int argc, char ** argv) {
|
||||
llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
|
||||
}
|
||||
|
||||
LOGLN(
|
||||
"recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu",
|
||||
log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
|
||||
LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
|
||||
embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
|
||||
|
||||
// if we will use the cache for the full prompt without reaching the end of the cache, force
|
||||
// reevaluation of the last token to recalculate the cached logits
|
||||
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
|
||||
LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1);
|
||||
LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1);
|
||||
|
||||
session_tokens.resize(embd_inp.size() - 1);
|
||||
}
|
||||
@ -392,21 +371,20 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (params.verbose_prompt) {
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
|
||||
if (params.n_keep > add_bos) {
|
||||
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
|
||||
LOG_INF("%s: static prompt based on n_keep: '", __func__);
|
||||
for (int i = 0; i < params.n_keep; i++) {
|
||||
LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
LOG_TEE("'\n");
|
||||
LOG("'\n");
|
||||
}
|
||||
LOG_TEE("\n");
|
||||
LOG_INF("\n");
|
||||
}
|
||||
|
||||
// ctrl+C handling
|
||||
@ -426,40 +404,40 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (params.interactive) {
|
||||
LOG_TEE("%s: interactive mode on.\n", __func__);
|
||||
LOG("%s: interactive mode on.\n", __func__);
|
||||
|
||||
if (!params.antiprompt.empty()) {
|
||||
for (const auto & antiprompt : params.antiprompt) {
|
||||
LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
|
||||
LOG("Reverse prompt: '%s'\n", antiprompt.c_str());
|
||||
if (params.verbose_prompt) {
|
||||
auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
|
||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
|
||||
LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (params.input_prefix_bos) {
|
||||
LOG_TEE("Input prefix with BOS\n");
|
||||
LOG("Input prefix with BOS\n");
|
||||
}
|
||||
|
||||
if (!params.input_prefix.empty()) {
|
||||
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
LOG("Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
if (params.verbose_prompt) {
|
||||
auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
|
||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
|
||||
LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!params.input_suffix.empty()) {
|
||||
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
LOG("Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
if (params.verbose_prompt) {
|
||||
auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
|
||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
|
||||
LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -467,13 +445,15 @@ int main(int argc, char ** argv) {
|
||||
|
||||
smpl = gpt_sampler_init(model, sparams);
|
||||
if (!smpl) {
|
||||
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
|
||||
exit(1);
|
||||
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
LOG_TEE("sampling params: \n%s\n", sparams.print().c_str());
|
||||
LOG_TEE(" sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str());
|
||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
|
||||
LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
|
||||
LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
|
||||
|
||||
LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
|
||||
// group-attention state
|
||||
// number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
|
||||
@ -487,9 +467,9 @@ int main(int argc, char ** argv) {
|
||||
GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
|
||||
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
|
||||
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
|
||||
LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
|
||||
LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
|
||||
}
|
||||
LOG_TEE("\n\n");
|
||||
LOG("\n");
|
||||
|
||||
if (params.interactive) {
|
||||
const char * control_message;
|
||||
@ -501,11 +481,11 @@ int main(int argc, char ** argv) {
|
||||
" - To return control without starting a new line, end your input with '/'.\n"
|
||||
" - If you want to submit another line, end your input with '\\'.\n";
|
||||
}
|
||||
LOG_TEE("== Running in interactive mode. ==\n");
|
||||
LOG("== Running in interactive mode. ==\n");
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
|
||||
LOG( " - Press Ctrl+C to interject at any time.\n");
|
||||
#endif
|
||||
LOG_TEE( "%s\n", control_message);
|
||||
LOG( "%s\n", control_message);
|
||||
|
||||
is_interacting = params.interactive_first;
|
||||
}
|
||||
@ -544,7 +524,7 @@ int main(int argc, char ** argv) {
|
||||
llama_token * enc_input_buf = embd_inp.data();
|
||||
|
||||
if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -570,9 +550,8 @@ int main(int argc, char ** argv) {
|
||||
embd.resize(max_embd_size);
|
||||
|
||||
console::set_display(console::error);
|
||||
printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
||||
LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
||||
console::set_display(console::reset);
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
if (ga_n == 1) {
|
||||
@ -580,16 +559,21 @@ int main(int argc, char ** argv) {
|
||||
// if we run out of context:
|
||||
// - take the n_keep first tokens from the original prompt (via n_past)
|
||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
||||
|
||||
if (n_past + (int) embd.size() >= n_ctx) {
|
||||
if (!params.ctx_shift){
|
||||
LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__);
|
||||
break;
|
||||
} else {
|
||||
if (params.n_predict == -2) {
|
||||
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
||||
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
||||
break;
|
||||
}
|
||||
|
||||
const int n_left = n_past - params.n_keep;
|
||||
const int n_discard = n_left/2;
|
||||
|
||||
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
|
||||
@ -597,13 +581,14 @@ int main(int argc, char ** argv) {
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
LOG("after swap: n_past = %d\n", n_past);
|
||||
LOG_DBG("after swap: n_past = %d\n", n_past);
|
||||
|
||||
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
||||
LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
|
||||
|
||||
LOG("clear session path\n");
|
||||
LOG_DBG("clear session path\n");
|
||||
path_session.clear();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// context extension via Self-Extend
|
||||
while (n_past >= ga_i + ga_w) {
|
||||
@ -611,10 +596,10 @@ int main(int argc, char ** argv) {
|
||||
const int bd = (ga_w/ga_n)*(ga_n - 1);
|
||||
const int dd = (ga_w/ga_n) - ib*bd - ga_w;
|
||||
|
||||
LOG("\n");
|
||||
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
|
||||
LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
|
||||
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
|
||||
LOG_DBG("\n");
|
||||
LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
|
||||
LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
|
||||
LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
|
||||
|
||||
llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
|
||||
llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
|
||||
@ -624,7 +609,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
ga_i += ga_w/ga_n;
|
||||
|
||||
LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
|
||||
LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
|
||||
}
|
||||
}
|
||||
|
||||
@ -656,19 +641,19 @@ int main(int argc, char ** argv) {
|
||||
n_eval = params.n_batch;
|
||||
}
|
||||
|
||||
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
||||
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
n_past += n_eval;
|
||||
|
||||
LOG("n_past = %d\n", n_past);
|
||||
LOG_DBG("n_past = %d\n", n_past);
|
||||
// Display total tokens alongside total time
|
||||
if (params.n_print > 0 && n_past % params.n_print == 0) {
|
||||
LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
|
||||
LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
|
||||
}
|
||||
}
|
||||
|
||||
@ -686,14 +671,14 @@ int main(int argc, char ** argv) {
|
||||
need_to_save_session = false;
|
||||
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
||||
|
||||
LOG("saved session to %s\n", path_session.c_str());
|
||||
LOG_DBG("saved session to %s\n", path_session.c_str());
|
||||
}
|
||||
|
||||
const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
|
||||
|
||||
gpt_sampler_accept(smpl, id, /* apply_grammar= */ true);
|
||||
gpt_sampler_accept(smpl, id, /* accept_grammar= */ true);
|
||||
|
||||
// LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str());
|
||||
// LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
|
||||
|
||||
embd.push_back(id);
|
||||
|
||||
@ -703,16 +688,16 @@ int main(int argc, char ** argv) {
|
||||
// decrement remaining sampling budget
|
||||
--n_remain;
|
||||
|
||||
LOG("n_remain: %d\n", n_remain);
|
||||
LOG_DBG("n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
// some user input remains from prompt or interaction, forward it to processing
|
||||
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
||||
LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
||||
while ((int) embd_inp.size() > n_consumed) {
|
||||
embd.push_back(embd_inp[n_consumed]);
|
||||
|
||||
// push the prompt in the sampling context in order to apply repetition penalties later
|
||||
// for the prompt, we don't apply grammar rules
|
||||
gpt_sampler_accept(smpl, embd_inp[n_consumed], /* apply_grammar= */ false);
|
||||
gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
|
||||
|
||||
++n_consumed;
|
||||
if ((int) embd.size() >= params.n_batch) {
|
||||
@ -727,7 +712,7 @@ int main(int argc, char ** argv) {
|
||||
const std::string token_str = llama_token_to_piece(ctx, id, params.special);
|
||||
|
||||
// Console/Stream Output
|
||||
fprintf(stdout, "%s", token_str.c_str());
|
||||
LOG("%s", token_str.c_str());
|
||||
|
||||
// Record Displayed Tokens To Log
|
||||
// Note: Generated tokens are created one by one hence this check
|
||||
@ -739,8 +724,6 @@ int main(int argc, char ** argv) {
|
||||
output_tokens.push_back(id);
|
||||
output_ss << token_str;
|
||||
}
|
||||
|
||||
fflush(stdout);
|
||||
}
|
||||
}
|
||||
|
||||
@ -789,13 +772,13 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (is_antiprompt) {
|
||||
LOG("found antiprompt: %s\n", last_output.c_str());
|
||||
LOG_DBG("found antiprompt: %s\n", last_output.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
// deal with end of generation tokens in interactive mode
|
||||
if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
|
||||
LOG("found an EOG token\n");
|
||||
LOG_DBG("found an EOG token\n");
|
||||
|
||||
if (params.interactive) {
|
||||
if (!params.antiprompt.empty()) {
|
||||
@ -809,7 +792,7 @@ int main(int argc, char ** argv) {
|
||||
chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
|
||||
}
|
||||
is_interacting = true;
|
||||
printf("\n");
|
||||
LOG("\n");
|
||||
}
|
||||
}
|
||||
|
||||
@ -820,21 +803,21 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (n_past > 0 && is_interacting) {
|
||||
LOG("waiting for user input\n");
|
||||
LOG_DBG("waiting for user input\n");
|
||||
|
||||
if (params.conversation) {
|
||||
printf("\n> ");
|
||||
LOG("\n> ");
|
||||
}
|
||||
|
||||
if (params.input_prefix_bos) {
|
||||
LOG("adding input prefix BOS token\n");
|
||||
LOG_DBG("adding input prefix BOS token\n");
|
||||
embd_inp.push_back(llama_token_bos(model));
|
||||
}
|
||||
|
||||
std::string buffer;
|
||||
if (!params.input_prefix.empty() && !params.conversation) {
|
||||
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
printf("%s", params.input_prefix.c_str());
|
||||
LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
LOG("%s", params.input_prefix.c_str());
|
||||
}
|
||||
|
||||
// color user input only
|
||||
@ -857,11 +840,11 @@ int main(int argc, char ** argv) {
|
||||
if (buffer.length() > 1) {
|
||||
// append input suffix if any
|
||||
if (!params.input_suffix.empty() && !params.conversation) {
|
||||
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
printf("%s", params.input_suffix.c_str());
|
||||
LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
LOG("%s", params.input_suffix.c_str());
|
||||
}
|
||||
|
||||
LOG("buffer: '%s'\n", buffer.c_str());
|
||||
LOG_DBG("buffer: '%s'\n", buffer.c_str());
|
||||
|
||||
const size_t original_size = embd_inp.size();
|
||||
|
||||
@ -878,7 +861,7 @@ int main(int argc, char ** argv) {
|
||||
const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
|
||||
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
|
||||
|
||||
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
|
||||
LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
|
||||
|
||||
// if user stop generation mid-way, we must add EOT to finish model's last response
|
||||
if (need_insert_eot && format_chat) {
|
||||
@ -901,9 +884,9 @@ int main(int argc, char ** argv) {
|
||||
assistant_ss.str("");
|
||||
|
||||
n_remain -= line_inp.size();
|
||||
LOG("n_remain: %d\n", n_remain);
|
||||
LOG_DBG("n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
LOG("empty line, passing control back\n");
|
||||
LOG_DBG("empty line, passing control back\n");
|
||||
}
|
||||
|
||||
input_echo = false; // do not echo this again
|
||||
@ -919,7 +902,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// end of generation
|
||||
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
|
||||
LOG_TEE(" [end of text]\n");
|
||||
LOG(" [end of text]\n");
|
||||
break;
|
||||
}
|
||||
|
||||
@ -932,11 +915,11 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
|
||||
LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
|
||||
LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
|
||||
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG("\n\n");
|
||||
gpt_perf_print(ctx, smpl);
|
||||
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
|
||||
|
||||
@ -950,9 +933,5 @@ int main(int argc, char ** argv) {
|
||||
ggml_threadpool_free(threadpool);
|
||||
ggml_threadpool_free(threadpool_batch);
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
LOG_TEE("Log end\n");
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -1,7 +1,10 @@
|
||||
// A basic application simulating a server with multiple clients.
|
||||
// The clients submit requests to the server and they are processed in parallel.
|
||||
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cmath>
|
||||
@ -81,7 +84,9 @@ static void print_date_time() {
|
||||
char buffer[80];
|
||||
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time);
|
||||
|
||||
printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer);
|
||||
LOG_INF("\n");
|
||||
LOG_INF("\033[35mrun parameters as of %s\033[0m\n", buffer);
|
||||
LOG_INF("\n");
|
||||
}
|
||||
|
||||
// Define a split string function to ...
|
||||
@ -100,11 +105,12 @@ int main(int argc, char ** argv) {
|
||||
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
gpt_init();
|
||||
|
||||
// number of simultaneous "clients" to simulate
|
||||
const int32_t n_clients = params.n_parallel;
|
||||
|
||||
@ -119,12 +125,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const bool dump_kv_cache = params.dump_kv_cache;
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("parallel", "log"));
|
||||
LOG_TEE("Log start\n");
|
||||
log_dump_cmdline(argc, argv);
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
// init llama.cpp
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
@ -137,23 +137,22 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// load the prompts from an external file if there are any
|
||||
if (params.prompt.empty()) {
|
||||
printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
|
||||
LOG_INF("\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
|
||||
} else {
|
||||
// Output each line of the input params.prompts vector and copy to k_prompts
|
||||
int index = 0;
|
||||
printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
|
||||
LOG_INF("\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
|
||||
|
||||
std::vector<std::string> prompts = split_string(params.prompt, '\n');
|
||||
for (const auto& prompt : prompts) {
|
||||
k_prompts.resize(index + 1);
|
||||
k_prompts[index] = prompt;
|
||||
index++;
|
||||
printf("%3d prompt: %s\n", index, prompt.c_str());
|
||||
LOG_INF("%3d prompt: %s\n", index, prompt.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
fflush(stderr);
|
||||
LOG_INF("\n\n");
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
@ -182,19 +181,19 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const auto t_main_start = ggml_time_us();
|
||||
|
||||
LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
|
||||
LOG_TEE("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
|
||||
LOG_TEE("\n");
|
||||
LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
|
||||
LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
|
||||
LOG_INF("\n");
|
||||
|
||||
{
|
||||
LOG_TEE("%s: Evaluating the system prompt ...\n", __func__);
|
||||
LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
|
||||
|
||||
for (int32_t i = 0; i < n_tokens_system; ++i) {
|
||||
llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch) != 0) {
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -203,10 +202,10 @@ int main(int argc, char ** argv) {
|
||||
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_INF("\n");
|
||||
}
|
||||
|
||||
LOG_TEE("Processing requests ...\n\n");
|
||||
LOG_INF("Processing requests ...\n\n");
|
||||
|
||||
while (true) {
|
||||
if (dump_kv_cache) {
|
||||
@ -237,7 +236,7 @@ int main(int argc, char ** argv) {
|
||||
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
|
||||
}
|
||||
|
||||
LOG_TEE("%s: clearing the KV cache\n", __func__);
|
||||
LOG_INF("%s: clearing the KV cache\n", __func__);
|
||||
}
|
||||
|
||||
// insert new sequences for decoding
|
||||
@ -272,7 +271,7 @@ int main(int argc, char ** argv) {
|
||||
client.n_decoded = 0;
|
||||
client.i_batch = batch.n_tokens - 1;
|
||||
|
||||
LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
|
||||
LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
|
||||
|
||||
g_seq_id += 1;
|
||||
|
||||
@ -316,11 +315,11 @@ int main(int argc, char ** argv) {
|
||||
if (ret != 0) {
|
||||
if (n_batch == 1 || ret < 0) {
|
||||
// if you get here, it means the KV cache is full - try increasing it via the context size
|
||||
LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
|
||||
LOG_ERR("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
|
||||
return 1;
|
||||
}
|
||||
|
||||
LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
|
||||
LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
|
||||
|
||||
n_cache_miss += 1;
|
||||
|
||||
@ -331,7 +330,7 @@ int main(int argc, char ** argv) {
|
||||
continue;
|
||||
}
|
||||
|
||||
LOG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
|
||||
LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
|
||||
|
||||
for (auto & client : clients) {
|
||||
if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
|
||||
@ -376,7 +375,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const auto t_main_end = ggml_time_us();
|
||||
|
||||
LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput: %s\n\033[35mResponse: %s\033[0m\n\n",
|
||||
LOG_INF("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\n\033[35mResponse: %s\033[0m\n\n",
|
||||
client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded,
|
||||
(t_main_end - client.t_start_prompt) / 1e6,
|
||||
(double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
|
||||
@ -399,22 +398,22 @@ int main(int argc, char ** argv) {
|
||||
|
||||
print_date_time();
|
||||
|
||||
LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
|
||||
LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
|
||||
if (params.prompt_file.empty()) {
|
||||
params.prompt_file = "used built-in defaults";
|
||||
}
|
||||
LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
|
||||
LOG_TEE("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
|
||||
LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
|
||||
LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
|
||||
|
||||
LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
|
||||
LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
|
||||
LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
|
||||
LOG_TEE("Cache misses: %6d\n", n_cache_miss);
|
||||
LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
|
||||
LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
|
||||
LOG_INF("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
|
||||
LOG_INF("Cache misses: %6d\n", n_cache_miss);
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_INF("\n");
|
||||
|
||||
// TODO: print sampling/grammar timings for all clients
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
@ -423,7 +422,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
LOG("\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -1,4 +1,6 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cmath>
|
||||
@ -7,9 +9,9 @@
|
||||
#include <vector>
|
||||
|
||||
static void print_usage(int, char ** argv) {
|
||||
LOG_TEE("\nexample usage:\n");
|
||||
LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
|
||||
LOG_TEE("\n");
|
||||
LOG("\nexample usage:\n");
|
||||
LOG("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
|
||||
LOG("\n");
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
@ -19,11 +21,12 @@ int main(int argc, char ** argv) {
|
||||
params.n_keep = 32;
|
||||
params.i_pos = -1;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PASSKEY, print_usage);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
gpt_init();
|
||||
|
||||
int n_junk = params.n_junk;
|
||||
int n_keep = params.n_keep;
|
||||
int n_grp = params.grp_attn_n;
|
||||
@ -63,7 +66,7 @@ int main(int argc, char ** argv) {
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -77,7 +80,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
|
||||
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -107,14 +110,14 @@ int main(int argc, char ** argv) {
|
||||
const int n_batch = ctx_params.n_batch;
|
||||
const int n_batch_grp = ctx_params.n_batch/n_grp;
|
||||
|
||||
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
|
||||
LOG_INF("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
|
||||
|
||||
// print the prompt token-by-token
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("prefix tokens: %d\n", n_tokens_prefix);
|
||||
LOG_TEE("prompt tokens: %d\n", n_tokens_all);
|
||||
//LOG_TEE("prompt: %s\n", params.prompt.c_str());
|
||||
LOG_INF("\n");
|
||||
LOG_INF("prefix tokens: %d\n", n_tokens_prefix);
|
||||
LOG_INF("prompt tokens: %d\n", n_tokens_all);
|
||||
//LOG_INF("prompt: %s\n", params.prompt.c_str());
|
||||
|
||||
llama_batch batch = llama_batch_init(params.n_batch, 0, 1);
|
||||
|
||||
@ -145,11 +148,11 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch) != 0) {
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
LOG_INF("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
|
||||
LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
|
||||
|
||||
if (i + n_batch >= n_tokens_all) {
|
||||
break;
|
||||
@ -159,7 +162,7 @@ int main(int argc, char ** argv) {
|
||||
for (int i = n_ctx; i < n_tokens_all; i += n_batch) {
|
||||
const int n_discard = n_batch;
|
||||
|
||||
LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard);
|
||||
LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
|
||||
@ -179,18 +182,18 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch) != 0) {
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
|
||||
LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
|
||||
}
|
||||
|
||||
{
|
||||
const int n_discard = n_past - n_ctx + n_predict;
|
||||
|
||||
if (n_discard > 0) {
|
||||
LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
|
||||
LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
|
||||
@ -201,17 +204,16 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
|
||||
LOG_TEE("\n");
|
||||
LOG_INF("\n");
|
||||
LOG_INF("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
|
||||
LOG_INF("\n");
|
||||
|
||||
// main loop
|
||||
|
||||
int n_cur = n_tokens_all;
|
||||
int n_decode = 0;
|
||||
|
||||
LOG_TEE("%s", prompt_suffix.c_str());
|
||||
fflush(stdout);
|
||||
LOG_INF("%s", prompt_suffix.c_str());
|
||||
|
||||
const auto t_main_start = ggml_time_us();
|
||||
|
||||
@ -220,17 +222,14 @@ int main(int argc, char ** argv) {
|
||||
{
|
||||
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
|
||||
|
||||
llama_sampler_accept(smpl, new_token_id);
|
||||
|
||||
// is it an end of generation?
|
||||
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
|
||||
LOG_TEE("\n");
|
||||
LOG("\n");
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
|
||||
fflush(stdout);
|
||||
LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
|
||||
|
||||
n_decode += 1;
|
||||
|
||||
@ -245,22 +244,22 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// evaluate the current batch with the transformer model
|
||||
if (llama_decode(ctx, batch)) {
|
||||
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
|
||||
LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG("\n");
|
||||
|
||||
const auto t_main_end = ggml_time_us();
|
||||
|
||||
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
|
||||
LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
|
||||
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
LOG("\n");
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
LOG("\n");
|
||||
|
||||
llama_sampler_free(smpl);
|
||||
|
||||
|
@ -1,18 +1,21 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <fstream>
|
||||
#include <mutex>
|
||||
#include <random>
|
||||
#include <sstream>
|
||||
#include <thread>
|
||||
#include <mutex>
|
||||
#include <atomic>
|
||||
#include <vector>
|
||||
#include <array>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
@ -40,7 +43,7 @@ static void write_logfile(
|
||||
}
|
||||
|
||||
if (params.hellaswag) {
|
||||
fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
|
||||
LOG_WRN("%s: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -48,7 +51,7 @@ static void write_logfile(
|
||||
|
||||
const bool success = fs_create_directory_with_parents(params.logdir);
|
||||
if (!success) {
|
||||
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
||||
LOG_WRN("%s: failed to create logdir %s, cannot write logfile\n",
|
||||
__func__, params.logdir.c_str());
|
||||
return;
|
||||
}
|
||||
@ -57,7 +60,7 @@ static void write_logfile(
|
||||
FILE * logfile = fopen(logfile_path.c_str(), "w");
|
||||
|
||||
if (logfile == NULL) {
|
||||
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
||||
LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
@ -343,16 +346,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
|
||||
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
|
||||
|
||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||
LOG_INF("%s: tokenizing the input ..\n", __func__);
|
||||
|
||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
if (int(tokens.size()) < 2*n_ctx) {
|
||||
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
|
||||
LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
|
||||
n_ctx);
|
||||
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
|
||||
LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
|
||||
return {std::move(tokens), 0., {}, {}};
|
||||
}
|
||||
|
||||
@ -363,16 +366,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||
prob_history.resize(tokens.size());
|
||||
|
||||
if (params.ppl_stride <= 0) {
|
||||
fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
|
||||
LOG_ERR("%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
|
||||
return {tokens, -1, logit_history, prob_history};
|
||||
}
|
||||
|
||||
const int calc_chunk = n_ctx;
|
||||
|
||||
fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
|
||||
LOG_INF("%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
|
||||
|
||||
if (int(tokens.size()) <= calc_chunk) {
|
||||
fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
|
||||
LOG_ERR("%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
|
||||
tokens.size(), n_ctx, params.ppl_stride);
|
||||
return {tokens, -1, logit_history, prob_history};
|
||||
}
|
||||
@ -386,14 +389,14 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||
int count = 0;
|
||||
double nll = 0.0;
|
||||
|
||||
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
|
||||
LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
|
||||
|
||||
for (int i = 0; i < n_chunk; ++i) {
|
||||
const int start = i * params.ppl_stride;
|
||||
const int end = start + calc_chunk;
|
||||
|
||||
const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
|
||||
//fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
|
||||
//LOG_DBG("%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
|
||||
|
||||
std::vector<float> logits;
|
||||
|
||||
@ -406,10 +409,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||
const int batch_start = start + j * n_batch;
|
||||
const int batch_size = std::min(end - batch_start, n_batch);
|
||||
|
||||
//fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
|
||||
//LOG_DBG(" Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
|
||||
// TODO: use llama_batch.logits instead of relying on logits_all == true
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
|
||||
//fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
//LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return {tokens, -1, logit_history, prob_history};
|
||||
}
|
||||
|
||||
@ -433,16 +436,17 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||
|
||||
if (i == 0) {
|
||||
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
|
||||
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
|
||||
LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
|
||||
int total_seconds = (int)(t_total * n_chunk);
|
||||
if (total_seconds >= 60*60) {
|
||||
fprintf(stderr, "%d hours ", total_seconds / (60*60));
|
||||
LOG("%d hours ", total_seconds / (60*60));
|
||||
total_seconds = total_seconds % (60*60);
|
||||
}
|
||||
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
|
||||
LOG("%.2f minutes\n", total_seconds / 60.0);
|
||||
}
|
||||
LOG("\n");
|
||||
|
||||
//fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
|
||||
//LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
|
||||
for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
|
||||
|
||||
// Calculate probability of next token, given the previous ones.
|
||||
@ -459,13 +463,12 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||
}
|
||||
// perplexity is e^(average negative log-likelihood)
|
||||
if (params.ppl_output_type == 0) {
|
||||
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
|
||||
LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
|
||||
} else {
|
||||
printf("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
|
||||
LOG("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
printf("\n");
|
||||
LOG("\n");
|
||||
|
||||
return {tokens, std::exp(nll / count), logit_history, prob_history};
|
||||
}
|
||||
@ -487,26 +490,26 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||
if (!params.logits_file.empty()) {
|
||||
logits_stream.open(params.logits_file.c_str(), std::ios::binary);
|
||||
if (!logits_stream.is_open()) {
|
||||
fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
|
||||
LOG_ERR("%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
|
||||
return {};
|
||||
}
|
||||
fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
|
||||
LOG_INF("%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
|
||||
logits_stream.write("_logits_", 8);
|
||||
logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));
|
||||
}
|
||||
|
||||
auto tim1 = std::chrono::high_resolution_clock::now();
|
||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||
LOG_INF("%s: tokenizing the input ..\n", __func__);
|
||||
|
||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
auto tim2 = std::chrono::high_resolution_clock::now();
|
||||
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
||||
LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
||||
|
||||
if (int(tokens.size()) < 2*n_ctx) {
|
||||
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
|
||||
LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
|
||||
n_ctx);
|
||||
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
|
||||
LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
|
||||
return {std::move(tokens), 0., {}, {}};
|
||||
}
|
||||
|
||||
@ -539,7 +542,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||
logits.reserve((size_t)n_ctx * n_vocab);
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
|
||||
LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
|
||||
|
||||
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
|
||||
|
||||
@ -612,7 +615,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
LOG_INF("%s : failed to eval\n", __func__);
|
||||
return {tokens, -1, logit_history, prob_history};
|
||||
}
|
||||
|
||||
@ -627,14 +630,15 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||
llama_synchronize(ctx);
|
||||
const auto t_end = std::chrono::high_resolution_clock::now();
|
||||
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
|
||||
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
|
||||
LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
|
||||
int total_seconds = (int)(t_total*n_chunk/n_seq);
|
||||
if (total_seconds >= 60*60) {
|
||||
fprintf(stderr, "%d hours ", total_seconds / (60*60));
|
||||
LOG("%d hours ", total_seconds / (60*60));
|
||||
total_seconds = total_seconds % (60*60);
|
||||
}
|
||||
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
|
||||
LOG("%.2f minutes\n", total_seconds / 60.0);
|
||||
}
|
||||
LOG("\n");
|
||||
|
||||
for (int seq = 0; seq < n_seq_batch; seq++) {
|
||||
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first);
|
||||
@ -655,19 +659,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||
|
||||
// perplexity is e^(average negative log-likelihood)
|
||||
if (params.ppl_output_type == 0) {
|
||||
printf("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
|
||||
LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
|
||||
} else {
|
||||
double av = nll/count;
|
||||
double av2 = nll2/count - av*av;
|
||||
if (av2 > 0) av2 = sqrt(av2/(count-1));
|
||||
printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
|
||||
LOG("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
|
||||
}
|
||||
}
|
||||
fflush(stdout);
|
||||
|
||||
logits.clear();
|
||||
}
|
||||
printf("\n");
|
||||
LOG("\n");
|
||||
|
||||
nll2 /= count;
|
||||
nll /= count;
|
||||
@ -675,9 +678,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||
nll2 -= nll * nll;
|
||||
if (nll2 > 0) {
|
||||
nll2 = sqrt(nll2/(count-1));
|
||||
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
|
||||
LOG_INF("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
|
||||
} else {
|
||||
printf("Unexpected negative standard deviation of log(prob)\n");
|
||||
LOG_ERR("Unexpected negative standard deviation of log(prob)\n");
|
||||
}
|
||||
|
||||
llama_batch_free(batch);
|
||||
@ -703,7 +706,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
|
||||
|
||||
const int ret = llama_decode(ctx, batch_view);
|
||||
if (ret != 0) {
|
||||
LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
|
||||
LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -789,15 +792,15 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
}
|
||||
|
||||
if (prompt_lines.size() % 6 != 0) {
|
||||
fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__);
|
||||
LOG_ERR("%s : number of lines in prompt not a multiple of 6.\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
size_t hs_task_count = prompt_lines.size()/6;
|
||||
fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
|
||||
LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
|
||||
|
||||
const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
|
||||
fprintf(stderr, "================================= is_spm = %d\n", is_spm);
|
||||
LOG_INF("================================= is_spm = %d\n", is_spm);
|
||||
|
||||
// The tasks should be randomized so the score stabilizes quickly.
|
||||
bool randomize_tasks = true;
|
||||
@ -824,7 +827,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
std::vector<llama_token> seq_tokens[4];
|
||||
};
|
||||
|
||||
fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
|
||||
LOG_INF("%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
|
||||
|
||||
// Select and read data from prompt lines
|
||||
std::vector<hs_data_t> hs_data(hs_task_count);
|
||||
@ -870,9 +873,9 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__);
|
||||
LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__);
|
||||
|
||||
printf("\ntask\tacc_norm\n");
|
||||
LOG("\ntask\tacc_norm\n");
|
||||
|
||||
double acc = 0.0f;
|
||||
|
||||
@ -940,7 +943,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
}
|
||||
|
||||
if (i0 == i1) {
|
||||
fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
|
||||
LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -948,7 +951,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
|
||||
// decode all tasks [i0, i1)
|
||||
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
|
||||
fprintf(stderr, "%s: llama_decode() failed\n", __func__);
|
||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -998,7 +1001,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
}
|
||||
}
|
||||
|
||||
//printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
|
||||
//LOG("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
|
||||
|
||||
// If the gold ending got the maximum logprobe add one accuracy point
|
||||
if (ending_logprob_max_idx == hs_cur.gold_ending_idx) {
|
||||
@ -1006,8 +1009,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
}
|
||||
|
||||
// Print the accumulated accuracy mean x 100
|
||||
printf("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
|
||||
fflush(stdout);
|
||||
LOG("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
|
||||
}
|
||||
|
||||
i0 = i1 - 1;
|
||||
@ -1015,7 +1017,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
printf("\n");
|
||||
LOG("\n");
|
||||
}
|
||||
|
||||
struct winogrande_entry {
|
||||
@ -1059,7 +1061,7 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
|
||||
}
|
||||
}
|
||||
if (ipos != 4) {
|
||||
printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
|
||||
LOG_ERR("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
|
||||
continue;
|
||||
}
|
||||
auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3)
|
||||
@ -1073,13 +1075,13 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
|
||||
if (sentence[where] == '_') break;
|
||||
}
|
||||
if (where == int(sentence.size())) {
|
||||
printf("%s: no _ in <%s>\n", __func__, sentence.c_str());
|
||||
LOG_ERR("%s: no _ in <%s>\n", __func__, sentence.c_str());
|
||||
continue;
|
||||
}
|
||||
std::istringstream stream(answer.c_str());
|
||||
int i_answer; stream >> i_answer;
|
||||
if (stream.fail() || i_answer < 1 || i_answer > 2) {
|
||||
printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
|
||||
LOG_ERR("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
|
||||
continue;
|
||||
}
|
||||
result.emplace_back();
|
||||
@ -1108,14 +1110,14 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
||||
|
||||
auto data = load_winogrande_from_csv(params.prompt);
|
||||
if (data.empty()) {
|
||||
fprintf(stderr, "%s: no tasks\n", __func__);
|
||||
LOG_ERR("%s: no tasks\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size());
|
||||
LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, data.size());
|
||||
|
||||
if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) {
|
||||
fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
|
||||
LOG_INF("%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
|
||||
std::mt19937 rng(1);
|
||||
std::vector<int> aux(data.size());
|
||||
for (int i = 0; i < int(data.size()); ++i) {
|
||||
@ -1133,7 +1135,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
||||
data = std::move(selected);
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
|
||||
LOG_INF("%s : tokenizing selected tasks\n", __func__);
|
||||
|
||||
for (auto & task : data) {
|
||||
task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
|
||||
@ -1156,7 +1158,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
||||
task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
|
||||
LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);
|
||||
|
||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
@ -1217,7 +1219,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
||||
}
|
||||
|
||||
if (i0 == i1) {
|
||||
fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
|
||||
LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -1225,7 +1227,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
||||
|
||||
// decode all tasks [i0, i1)
|
||||
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
|
||||
fprintf(stderr, "%s: llama_decode() failed\n", __func__);
|
||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -1285,20 +1287,20 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
||||
++n_done;
|
||||
|
||||
// print the accumulated accuracy mean x 100
|
||||
printf("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
|
||||
fflush(stdout);
|
||||
LOG("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
|
||||
}
|
||||
|
||||
i0 = i1 - 1;
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
LOG("\n");
|
||||
|
||||
if (n_done < 100) return;
|
||||
|
||||
const float p = 1.f*n_correct/n_done;
|
||||
const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1));
|
||||
printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
|
||||
|
||||
LOG_INF("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
|
||||
}
|
||||
|
||||
static bool deserialize_string(std::istream & in, std::string & str) {
|
||||
@ -1347,7 +1349,7 @@ struct multiple_choice_task {
|
||||
static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
|
||||
if (task.question.empty() || task.mc1.answers.empty()) {
|
||||
if (log_error) {
|
||||
printf("%s: found bad task with empty question and/or answers\n", __func__);
|
||||
LOG_ERR("%s: found bad task with empty question and/or answers\n", __func__);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@ -1355,7 +1357,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
|
||||
for (auto& answer : task.mc1.answers) {
|
||||
if (answer.empty()) {
|
||||
if (log_error) {
|
||||
printf("%s: found empty answer\n", __func__);
|
||||
LOG_ERR("%s: found empty answer\n", __func__);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@ -1409,14 +1411,14 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||
uint32_t n_task;
|
||||
strstream.read((char *)&n_task, sizeof(n_task));
|
||||
if (strstream.fail() || n_task == 0) {
|
||||
printf("%s: no tasks\n", __func__);
|
||||
LOG_ERR("%s: no tasks\n", __func__);
|
||||
return;
|
||||
}
|
||||
printf("%s: there are %u tasks in prompt\n", __func__, n_task);
|
||||
LOG_INF("%s: there are %u tasks in prompt\n", __func__, n_task);
|
||||
std::vector<uint32_t> task_pos(n_task);
|
||||
strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
|
||||
if (strstream.fail()) {
|
||||
printf("%s: failed to read task positions from prompt\n", __func__);
|
||||
LOG_ERR("%s: failed to read task positions from prompt\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -1424,21 +1426,21 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||
if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
|
||||
// Use all tasks
|
||||
tasks.resize(n_task);
|
||||
printf("%s: reading tasks", __func__);
|
||||
LOG_INF("%s: reading tasks", __func__);
|
||||
int n_dot = std::max((int) n_task/100, 1);
|
||||
int i = 0;
|
||||
for (auto& task : tasks) {
|
||||
++i;
|
||||
if (!task.deserialize(strstream)) {
|
||||
printf("%s: failed to read task %d of %u\n", __func__, i, n_task);
|
||||
LOG_ERR("%s: failed to read task %d of %u\n", __func__, i, n_task);
|
||||
return;
|
||||
}
|
||||
if (i%n_dot == 0) printf(".");
|
||||
if (i%n_dot == 0) LOG(".");
|
||||
}
|
||||
printf("done\n");
|
||||
LOG("done\n");
|
||||
}
|
||||
else {
|
||||
printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
|
||||
LOG_INF("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
|
||||
std::mt19937 rng(1);
|
||||
std::vector<int> aux(n_task);
|
||||
for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
|
||||
@ -1451,18 +1453,16 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||
aux.pop_back();
|
||||
strstream.seekg(task_pos[idx], std::ios::beg);
|
||||
if (!task.deserialize(strstream)) {
|
||||
printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
|
||||
LOG_ERR("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
|
||||
return;
|
||||
}
|
||||
}
|
||||
n_task = params.multiple_choice_tasks;
|
||||
}
|
||||
|
||||
printf("%s: preparing task data", __func__);
|
||||
fflush(stdout);
|
||||
LOG_INF("%s: preparing task data", __func__);
|
||||
if (n_task > 500) {
|
||||
printf("...");
|
||||
fflush(stdout);
|
||||
LOG("...");
|
||||
std::atomic<int> counter(0);
|
||||
std::atomic<int> n_bad(0);
|
||||
auto prepare = [&counter, &n_bad, &tasks, ctx] () {
|
||||
@ -1486,11 +1486,10 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||
for (auto& w : workers) w = std::thread(prepare);
|
||||
prepare();
|
||||
for (auto& w : workers) w.join();
|
||||
printf("done\n");
|
||||
fflush(stdout);
|
||||
LOG("done\n");
|
||||
int nbad = n_bad;
|
||||
if (nbad > 0) {
|
||||
printf("%s: found %d malformed tasks\n", __func__, nbad);
|
||||
LOG_ERR("%s: found %d malformed tasks\n", __func__, nbad);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
@ -1502,16 +1501,15 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||
return;
|
||||
}
|
||||
if (i_task%n_dot == 0) {
|
||||
printf(".");
|
||||
fflush(stdout);
|
||||
LOG(".");
|
||||
}
|
||||
}
|
||||
printf("done\n");
|
||||
LOG("done\n");
|
||||
}
|
||||
|
||||
printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
|
||||
LOG_INF("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
|
||||
|
||||
printf("\ntask\tacc_norm\n");
|
||||
LOG("\ntask\tacc_norm\n");
|
||||
|
||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
@ -1590,7 +1588,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||
}
|
||||
|
||||
if (i0 == i1) {
|
||||
fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
|
||||
LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -1598,7 +1596,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||
|
||||
// decode all tasks [i0, i1)
|
||||
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
|
||||
fprintf(stderr, "%s: llama_decode() failed\n", __func__);
|
||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -1622,13 +1620,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||
// compute the logprobs for each ending of the decoded tasks
|
||||
for (size_t i = i0; i < i1; ++i) {
|
||||
auto & cur_task = tasks[i];
|
||||
//printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
|
||||
//LOG("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
|
||||
//for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
|
||||
// if (cur_task.mc1.labels[j] == 1) {
|
||||
// printf("%d", j+1);
|
||||
// LOG("%d", j+1);
|
||||
// }
|
||||
//}
|
||||
//printf("\n common_prefix: %zu\n", cur_task.common_prefix);
|
||||
//LOG("\n common_prefix: %zu\n", cur_task.common_prefix);
|
||||
|
||||
// get the logits of the last token of the common prefix
|
||||
std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float));
|
||||
@ -1640,13 +1638,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||
size_t count = 1;
|
||||
float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
|
||||
for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
|
||||
//printf(" %zu %g\n", ir, eval_results[ir]);
|
||||
//LOG(" %zu %g\n", ir, eval_results[ir]);
|
||||
++count;
|
||||
log_prob += eval_results[ir++];
|
||||
}
|
||||
cur_task.log_probs[s] = log_prob / count;
|
||||
//printf(" Final: %g\n", log_prob / count);
|
||||
//printf(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
|
||||
//LOG(" Final: %g\n", log_prob / count);
|
||||
//LOG(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
|
||||
}
|
||||
|
||||
// Find the ending with maximum logprob
|
||||
@ -1666,8 +1664,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||
++n_done;
|
||||
|
||||
// Print the accumulated accuracy mean x 100
|
||||
printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
|
||||
fflush(stdout);
|
||||
LOG("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
|
||||
}
|
||||
|
||||
i0 = i1 - 1;
|
||||
@ -1679,29 +1676,30 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||
|
||||
float p = 1.f*n_correct/n_done;
|
||||
float sigma = sqrt(p*(1-p)/(n_done-1));
|
||||
printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
|
||||
LOG("\n");
|
||||
LOG_INF("Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
|
||||
p = 1.f*n_done/n_tot_answers;
|
||||
sigma = sqrt(p*(1-p)/(n_done-1));
|
||||
printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
|
||||
LOG_INF("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
|
||||
|
||||
printf("\n");
|
||||
LOG_INF("\n");
|
||||
}
|
||||
|
||||
static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
||||
if (params.logits_file.empty()) {
|
||||
fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
|
||||
LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
|
||||
return;
|
||||
}
|
||||
std::ifstream in(params.logits_file.c_str(), std::ios::binary);
|
||||
if (!in) {
|
||||
fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str());
|
||||
LOG_ERR("%s: failed to open %s\n", __func__, params.logits_file.c_str());
|
||||
return;
|
||||
}
|
||||
{
|
||||
char check[9]; check[8] = 0;
|
||||
in.read(check, 8);
|
||||
if (in.fail() || strncmp("_logits_", check, 8) != 0) {
|
||||
fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
|
||||
LOG_ERR("%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
|
||||
return;
|
||||
}
|
||||
}
|
||||
@ -1709,7 +1707,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
||||
uint32_t n_ctx;
|
||||
in.read((char *)&n_ctx, sizeof(n_ctx));
|
||||
if (n_ctx > llama_n_ctx(ctx)) {
|
||||
fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
|
||||
LOG_ERR("%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
|
||||
__func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
|
||||
}
|
||||
|
||||
@ -1717,16 +1715,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
||||
in.read((char *)&n_vocab, sizeof(n_vocab));
|
||||
in.read((char *)&n_chunk, sizeof(n_chunk));
|
||||
if (in.fail()) {
|
||||
fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
|
||||
LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
|
||||
return;
|
||||
}
|
||||
if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
|
||||
fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
|
||||
LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
|
||||
}
|
||||
|
||||
std::vector<llama_token> tokens(n_ctx * n_chunk);
|
||||
if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
|
||||
fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
|
||||
LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
@ -1775,7 +1773,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
||||
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
|
||||
fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i);
|
||||
LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -1796,7 +1794,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
||||
|
||||
// TODO: use llama_batch.logits instead of relying on logits_all == true
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -1813,16 +1811,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
||||
|
||||
if (i == 0) {
|
||||
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
|
||||
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
|
||||
LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
|
||||
int total_seconds = (int)(t_total * n_chunk);
|
||||
if (total_seconds >= 60*60) {
|
||||
fprintf(stderr, "%d hours ", total_seconds / (60*60));
|
||||
LOG("%d hours ", total_seconds / (60*60));
|
||||
total_seconds = total_seconds % (60*60);
|
||||
}
|
||||
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
|
||||
|
||||
printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
|
||||
LOG("%.2f minutes\n", total_seconds / 60.0);
|
||||
}
|
||||
LOG("\n");
|
||||
LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
|
||||
|
||||
const int first = n_ctx/2;
|
||||
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
|
||||
@ -1831,79 +1829,77 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
||||
p_diff_ptr += n_ctx - 1 - first;
|
||||
kld_ptr += n_ctx - 1 - first;
|
||||
|
||||
printf("%4d", i+1);
|
||||
LOG("%4d", i+1);
|
||||
|
||||
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
|
||||
const double ppl_val = exp(log_ppl.first);
|
||||
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
|
||||
printf(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
|
||||
LOG(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
|
||||
|
||||
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
|
||||
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
|
||||
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
|
||||
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
|
||||
printf(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
|
||||
LOG(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
|
||||
|
||||
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
|
||||
printf(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
|
||||
LOG(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
|
||||
|
||||
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
|
||||
const double p_diff_rms_val = sqrt(p_diff_mse.first);
|
||||
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
|
||||
printf(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
|
||||
LOG(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
|
||||
|
||||
double p_top_val = 1.*kld.n_same_top/kld.count;
|
||||
double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
|
||||
printf(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
|
||||
LOG(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
|
||||
|
||||
printf("\n");
|
||||
|
||||
fflush(stdout);
|
||||
LOG("\n");
|
||||
|
||||
logits.clear();
|
||||
}
|
||||
printf("\n");
|
||||
LOG("\n");
|
||||
|
||||
if (kld.count < 100) return; // we do not wish to do statistics on so few values
|
||||
|
||||
std::sort(kld_values.begin(), kld_values.end());
|
||||
std::sort(p_diff_values.begin(), p_diff_values.end());
|
||||
|
||||
printf("====== Perplexity statistics ======\n");
|
||||
LOG("====== Perplexity statistics ======\n");
|
||||
|
||||
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
|
||||
const double ppl_val = exp(log_ppl.first);
|
||||
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
|
||||
printf("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
|
||||
LOG("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
|
||||
|
||||
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
|
||||
const double ppl_base_val = exp(log_ppl_base.first);
|
||||
const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 )
|
||||
printf("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
|
||||
LOG("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
|
||||
|
||||
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
|
||||
// printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
|
||||
// LOG("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
|
||||
const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second);
|
||||
printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
|
||||
LOG("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
|
||||
|
||||
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
|
||||
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
|
||||
printf("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
|
||||
LOG("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
|
||||
|
||||
const double ppl_ratio_val = exp(log_ppl_ratio_val);
|
||||
const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 )
|
||||
printf("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
|
||||
LOG("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
|
||||
|
||||
const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov;
|
||||
const double ppl_diff_val = ppl_val - ppl_base_val;
|
||||
const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov);
|
||||
printf("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
|
||||
LOG("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
|
||||
|
||||
printf("\n");
|
||||
LOG("\n");
|
||||
|
||||
printf("====== KL divergence statistics ======\n");
|
||||
LOG("====== KL divergence statistics ======\n");
|
||||
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
|
||||
printf("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
|
||||
LOG("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
|
||||
auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
|
||||
: kld_values[kld_values.size()/2];
|
||||
|
||||
@ -1915,50 +1911,49 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
||||
return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
|
||||
};
|
||||
|
||||
printf("Maximum KLD: %10.6f\n", kld_values.back());
|
||||
printf("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
|
||||
printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
|
||||
printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
|
||||
printf("Median KLD: %10.6f\n", kld_median);
|
||||
printf("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
|
||||
printf(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
|
||||
printf(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
|
||||
printf("Minimum KLD: %10.6f\n", kld_values.front());
|
||||
LOG("Maximum KLD: %10.6f\n", kld_values.back());
|
||||
LOG("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
|
||||
LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
|
||||
LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
|
||||
LOG("Median KLD: %10.6f\n", kld_median);
|
||||
LOG("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
|
||||
LOG(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
|
||||
LOG(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
|
||||
LOG("Minimum KLD: %10.6f\n", kld_values.front());
|
||||
|
||||
printf("\n");
|
||||
LOG("\n");
|
||||
|
||||
printf("====== Token probability statistics ======\n");
|
||||
LOG("====== Token probability statistics ======\n");
|
||||
|
||||
auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
|
||||
printf("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);
|
||||
LOG("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);
|
||||
|
||||
auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1])
|
||||
: p_diff_values[p_diff_values.size()/2];
|
||||
|
||||
printf("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back());
|
||||
printf("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
|
||||
printf("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
|
||||
printf("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
|
||||
printf("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
|
||||
printf("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
|
||||
printf("Median Δp: %6.3lf%%\n", 100.0*p_diff_median);
|
||||
printf("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
|
||||
printf("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
|
||||
printf(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
|
||||
printf(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
|
||||
printf(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
|
||||
printf("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front());
|
||||
LOG("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back());
|
||||
LOG("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
|
||||
LOG("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
|
||||
LOG("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
|
||||
LOG("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
|
||||
LOG("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
|
||||
LOG("Median Δp: %6.3lf%%\n", 100.0*p_diff_median);
|
||||
LOG("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
|
||||
LOG("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
|
||||
LOG(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
|
||||
LOG(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
|
||||
LOG(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
|
||||
LOG("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front());
|
||||
|
||||
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
|
||||
// printf("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
|
||||
// LOG("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
|
||||
|
||||
const double p_diff_rms_val = sqrt(p_diff_mse.first);
|
||||
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
|
||||
printf("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
|
||||
LOG("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
|
||||
|
||||
const double same_top_p = 1.0*kld.n_same_top/kld.count;
|
||||
printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
|
||||
|
||||
LOG("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
@ -1967,15 +1962,16 @@ int main(int argc, char ** argv) {
|
||||
params.n_ctx = 512;
|
||||
params.logits_all = true;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PERPLEXITY);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
gpt_init();
|
||||
|
||||
const int32_t n_ctx = params.n_ctx;
|
||||
|
||||
if (n_ctx <= 0) {
|
||||
fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
|
||||
LOG_ERR("%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -2000,15 +1996,11 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (params.ppl_stride > 0) {
|
||||
fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
|
||||
LOG_INF("Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
|
||||
params.n_ctx, params.n_ctx + params.ppl_stride/2);
|
||||
params.n_ctx += params.ppl_stride/2;
|
||||
}
|
||||
|
||||
print_build_info();
|
||||
|
||||
LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
@ -2018,21 +2010,21 @@ int main(int argc, char ** argv) {
|
||||
llama_model * model = llama_init.model;
|
||||
llama_context * ctx = llama_init.context;
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
||||
LOG_ERR("%s: unable to load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const int n_ctx_train = llama_n_ctx_train(model);
|
||||
|
||||
if (params.n_ctx > n_ctx_train) {
|
||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
|
||||
__func__, n_ctx_train, params.n_ctx);
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
||||
LOG_INF("\n");
|
||||
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
|
||||
}
|
||||
|
||||
struct results_perplexity results;
|
||||
@ -2048,8 +2040,9 @@ int main(int argc, char ** argv) {
|
||||
results = perplexity(ctx, params, n_ctx);
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
LOG("\n");
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
write_logfile(ctx, params, model, results);
|
||||
|
||||
llama_free(ctx);
|
||||
|
@ -1,6 +1,6 @@
|
||||
set(TARGET llama-quantize)
|
||||
add_executable(${TARGET} quantize.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
@ -54,6 +54,8 @@ As the models are currently fully loaded into memory, you will need adequate dis
|
||||
|
||||
Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
|
||||
|
||||
The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format.
|
||||
|
||||
*(outdated)*
|
||||
|
||||
| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
|
||||
|
@ -1,13 +1,16 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include <iostream> // TODO: remove me
|
||||
|
||||
static void print_usage(int, char ** argv) {
|
||||
LOG_TEE("\nexample usage:\n");
|
||||
LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
|
||||
LOG_TEE("\n");
|
||||
LOG("\nexample usage:\n");
|
||||
LOG("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
|
||||
LOG("\n");
|
||||
}
|
||||
|
||||
struct chunk {
|
||||
@ -16,7 +19,7 @@ struct chunk {
|
||||
// original file position
|
||||
size_t filepos;
|
||||
// original text data
|
||||
std::string textdata = "";
|
||||
std::string textdata;
|
||||
// tokenized text data
|
||||
std::vector<llama_token> tokens;
|
||||
// embedding
|
||||
@ -30,14 +33,14 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
|
||||
std::ifstream f(filename.c_str());
|
||||
|
||||
if (!f.is_open()) {
|
||||
fprintf(stderr, "Error: could not open file %s\n", filename.c_str());
|
||||
LOG_ERR("could not open file %s\n", filename.c_str());
|
||||
return chunks;
|
||||
}
|
||||
|
||||
chunk current_chunk;
|
||||
char buffer[1024];
|
||||
int64_t filepos = 0;
|
||||
std::string current = "";
|
||||
std::string current;
|
||||
while (f.read(buffer, 1024)) {
|
||||
current += std::string(buffer, f.gcount());
|
||||
size_t pos;
|
||||
@ -83,9 +86,9 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
||||
llama_kv_cache_clear(ctx);
|
||||
|
||||
// run model
|
||||
fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
||||
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
||||
if (llama_decode(ctx, batch) < 0) {
|
||||
fprintf(stderr, "%s : failed to decode\n", __func__);
|
||||
LOG_ERR("%s : failed to decode\n", __func__);
|
||||
}
|
||||
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
@ -98,7 +101,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
||||
if (embd == NULL) {
|
||||
embd = llama_get_embeddings_ith(ctx, i);
|
||||
if (embd == NULL) {
|
||||
fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
|
||||
LOG_ERR("%s: failed to get embeddings for token %d\n", __func__, i);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@ -111,29 +114,28 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_RETRIEVAL, print_usage);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
gpt_init();
|
||||
|
||||
// For BERT models, batch size must be equal to ubatch size
|
||||
params.n_ubatch = params.n_batch;
|
||||
params.embedding = true;
|
||||
|
||||
if (params.chunk_size <= 0) {
|
||||
fprintf(stderr, "chunk_size must be positive\n");
|
||||
LOG_ERR("chunk_size must be positive\n");
|
||||
return 1;
|
||||
}
|
||||
if (params.context_files.empty()) {
|
||||
fprintf(stderr, "context_files must be specified\n");
|
||||
LOG_ERR("context_files must be specified\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
print_build_info();
|
||||
|
||||
printf("processing files:\n");
|
||||
LOG_INF("processing files:\n");
|
||||
for (auto & context_file : params.context_files) {
|
||||
printf("%s\n", context_file.c_str());
|
||||
LOG_INF("%s\n", context_file.c_str());
|
||||
}
|
||||
|
||||
std::vector<chunk> chunks;
|
||||
@ -141,7 +143,7 @@ int main(int argc, char ** argv) {
|
||||
std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
|
||||
chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
|
||||
}
|
||||
printf("Number of chunks: %ld\n", chunks.size());
|
||||
LOG_INF("Number of chunks: %ld\n", chunks.size());
|
||||
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
@ -153,7 +155,7 @@ int main(int argc, char ** argv) {
|
||||
llama_context * ctx = llama_init.context;
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
||||
LOG_ERR("%s: unable to load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -162,19 +164,19 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
|
||||
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
||||
fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
|
||||
LOG_ERR("%s: pooling type NONE not supported\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (n_ctx > n_ctx_train) {
|
||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||
LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||
__func__, n_ctx_train, n_ctx);
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
||||
LOG_INF("\n");
|
||||
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
|
||||
}
|
||||
|
||||
// max batch size
|
||||
@ -185,7 +187,7 @@ int main(int argc, char ** argv) {
|
||||
for (auto & chunk : chunks) {
|
||||
auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false);
|
||||
if (inp.size() > n_batch) {
|
||||
fprintf(stderr, "%s: error: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
|
||||
LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
|
||||
__func__, (long long int) inp.size(), (long long int) n_batch);
|
||||
return 1;
|
||||
}
|
||||
@ -199,12 +201,12 @@ int main(int argc, char ** argv) {
|
||||
// tokenization stats
|
||||
if (params.verbose_prompt) {
|
||||
for (int i = 0; i < (int) chunks.size(); i++) {
|
||||
fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
|
||||
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
|
||||
LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
|
||||
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
|
||||
for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
|
||||
fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
|
||||
LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
|
||||
}
|
||||
fprintf(stderr, "\n\n");
|
||||
LOG_INF("\n\n");
|
||||
}
|
||||
}
|
||||
|
||||
@ -256,7 +258,7 @@ int main(int argc, char ** argv) {
|
||||
// start loop, receive query and return top k similar chunks based on cosine similarity
|
||||
std::string query;
|
||||
while (true) {
|
||||
printf("Enter query: ");
|
||||
LOG("Enter query: ");
|
||||
std::getline(std::cin, query);
|
||||
std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
|
||||
|
||||
@ -280,19 +282,19 @@ int main(int argc, char ** argv) {
|
||||
return a.second > b.second;
|
||||
});
|
||||
|
||||
printf("Top %d similar chunks:\n", params.sparams.top_k);
|
||||
LOG("Top %d similar chunks:\n", params.sparams.top_k);
|
||||
for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
|
||||
printf("filename: %s\n", chunks[similarities[i].first].filename.c_str());
|
||||
printf("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
|
||||
printf("similarity: %f\n", similarities[i].second);
|
||||
printf("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
|
||||
printf("--------------------\n");
|
||||
LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str());
|
||||
LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
|
||||
LOG("similarity: %f\n", similarities[i].second);
|
||||
LOG("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
|
||||
LOG("--------------------\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
LOG("\n");
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
// clean up
|
||||
llama_batch_free(query_batch);
|
||||
|
@ -10,20 +10,21 @@ This can be used for distributed LLM inference with `llama.cpp` in the following
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
rpcb---|TCP|srva
|
||||
rpcb---|TCP|srvb
|
||||
rpcb-.-|TCP|srvn
|
||||
rpcb<-->|TCP|srva
|
||||
rpcb<-->|TCP|srvb
|
||||
rpcb<-.->|TCP|srvn
|
||||
subgraph hostn[Host N]
|
||||
srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"]
|
||||
srvn[rpc-server]<-.->backend3["Backend (CUDA,Metal,etc.)"]
|
||||
end
|
||||
subgraph hostb[Host B]
|
||||
srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"]
|
||||
srvb[rpc-server]<-->backend2["Backend (CUDA,Metal,etc.)"]
|
||||
end
|
||||
subgraph hosta[Host A]
|
||||
srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"]
|
||||
srva[rpc-server]<-->backend["Backend (CUDA,Metal,etc.)"]
|
||||
end
|
||||
subgraph host[Main Host]
|
||||
ggml[llama.cpp]---rpcb[RPC backend]
|
||||
local["Backend (CUDA,Metal,etc.)"]<-->ggml[llama-cli]
|
||||
ggml[llama-cli]<-->rpcb[RPC backend]
|
||||
end
|
||||
style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5
|
||||
```
|
||||
@ -62,17 +63,12 @@ $ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
|
||||
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
|
||||
|
||||
|
||||
On the main host build `llama.cpp` only with `-DGGML_RPC=ON`:
|
||||
|
||||
```bash
|
||||
mkdir build-rpc
|
||||
cd build-rpc
|
||||
cmake .. -DGGML_RPC=ON
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
|
||||
On the main host build `llama.cpp` for the local backend and add `-DGGML_RPC=ON` to the build options.
|
||||
Finally, when running `llama-cli`, use the `--rpc` option to specify the host and port of each `rpc-server`:
|
||||
|
||||
```bash
|
||||
$ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
|
||||
```
|
||||
|
||||
This way you can offload model layers to both local and remote devices.
|
||||
|
||||
|
@ -1,3 +1,4 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
@ -10,8 +11,7 @@ int main(int argc, char ** argv) {
|
||||
params.prompt = "The quick brown fox";
|
||||
params.sparams.seed = 1234;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -74,8 +74,6 @@ int main(int argc, char ** argv) {
|
||||
auto next_token = llama_sampler_sample(smpl, ctx, -1);
|
||||
auto next_token_str = llama_token_to_piece(ctx, next_token);
|
||||
|
||||
llama_sampler_accept(smpl, next_token);
|
||||
|
||||
printf("%s", next_token_str.c_str());
|
||||
result0 += next_token_str;
|
||||
|
||||
@ -132,8 +130,6 @@ int main(int argc, char ** argv) {
|
||||
auto next_token = llama_sampler_sample(smpl2, ctx2, -1);
|
||||
auto next_token_str = llama_token_to_piece(ctx2, next_token);
|
||||
|
||||
llama_sampler_accept(smpl2, next_token);
|
||||
|
||||
printf("%s", next_token_str.c_str());
|
||||
result1 += next_token_str;
|
||||
|
||||
@ -222,8 +218,6 @@ int main(int argc, char ** argv) {
|
||||
auto next_token = llama_sampler_sample(smpl3, ctx3, -1);
|
||||
auto next_token_str = llama_token_to_piece(ctx3, next_token);
|
||||
|
||||
llama_sampler_accept(smpl3, next_token);
|
||||
|
||||
printf("%s", next_token_str.c_str());
|
||||
result2 += next_token_str;
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
set(TARGET llama-server)
|
||||
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
|
||||
|
||||
option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
|
||||
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
|
||||
@ -30,6 +30,7 @@ set(PUBLIC_ASSETS
|
||||
system-prompts.js
|
||||
prompt-formats.js
|
||||
json-schema-to-grammar.mjs
|
||||
loading.html
|
||||
)
|
||||
|
||||
foreach(asset ${PUBLIC_ASSETS})
|
||||
@ -45,9 +46,6 @@ endforeach()
|
||||
|
||||
add_executable(${TARGET} ${TARGET_SRCS})
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_compile_definitions(${TARGET} PRIVATE
|
||||
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
||||
)
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
|
@ -23,36 +23,32 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--version` | show version and build info |
|
||||
| `-v, --verbose` | print verbose information |
|
||||
| `--verbosity N` | set specific verbosity level (default: 0) |
|
||||
| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
|
||||
| `--no-display-prompt` | don't print prompt at generation (default: false) |
|
||||
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) |
|
||||
| `-t, --threads N` | number of threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
|
||||
| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
|
||||
| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
|
||||
| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask |
|
||||
| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0)<br/> |
|
||||
| `--prio N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
|
||||
| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50)<br/> |
|
||||
| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) |
|
||||
| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch |
|
||||
| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
|
||||
| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
|
||||
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
|
||||
| `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) |
|
||||
| `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) |
|
||||
| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
|
||||
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
|
||||
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
|
||||
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
|
||||
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
|
||||
| `--chunks N` | max number of chunks to process (default: -1, -1 = all) |
|
||||
| `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
|
||||
| `-p, --prompt PROMPT` | prompt to start generation with |
|
||||
| `-f, --file FNAME` | a file containing the prompt (default: none) |
|
||||
| `--in-file FNAME` | an input file (repeat to specify multiple files) |
|
||||
| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
|
||||
| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
|
||||
| `--no-escape` | do not process escape sequences |
|
||||
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
|
||||
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typical_p;top_p;min_p;temperature) |
|
||||
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typ_p;top_p;min_p;temperature) |
|
||||
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) |
|
||||
| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
|
||||
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
|
||||
| `--penalize-nl` | penalize newline tokens (default: false) |
|
||||
@ -92,13 +88,12 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) |
|
||||
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
||||
| `-np, --parallel N` | number of parallel sequences to decode (default: 1) |
|
||||
| `-ns, --sequences N` | number of sequences to decode (default: 1) |
|
||||
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
|
||||
| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
|
||||
| `--mlock` | force system to keep model in RAM rather than swapping or compressing |
|
||||
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) |
|
||||
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437 |
|
||||
| `-ngl, --gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
|
||||
| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
|
||||
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs |
|
||||
| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 |
|
||||
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) |
|
||||
@ -109,7 +104,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
|
||||
| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE<br/>note: this argument can be repeated to add multiple scaled control vectors |
|
||||
| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
|
||||
| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_MODEL) |
|
||||
| `-a, --alias STRING` | set alias for model name (to be used by REST API) |
|
||||
| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
|
||||
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
|
||||
| `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
|
||||
@ -123,10 +118,9 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--api-key-file FNAME` | path to file containing API keys (default: none) |
|
||||
| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key |
|
||||
| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate |
|
||||
| `--timeout N` | server read/write timeout in seconds (default: 600) |
|
||||
| `-to, --timeout N` | server read/write timeout in seconds (default: 600) |
|
||||
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
|
||||
| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
|
||||
| `--log-format {text, json}` | log output format: json or text (default: json) |
|
||||
| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
|
||||
| `--no-slots` | disables slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
|
||||
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
|
||||
@ -412,9 +406,44 @@ Notice that each `probs` is an array of length `n_probs`.
|
||||
|
||||
*Options:*
|
||||
|
||||
`content`: Set the text to tokenize.
|
||||
`content`: (Required) The text to tokenize.
|
||||
|
||||
`add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
|
||||
`add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
|
||||
|
||||
`with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs. Default: `false`
|
||||
|
||||
**Response:**
|
||||
|
||||
Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter. The piece field is a string if the piece is valid unicode or a list of bytes otherwise.
|
||||
|
||||
|
||||
If `with_pieces` is `false`:
|
||||
```json
|
||||
{
|
||||
"tokens": [123, 456, 789]
|
||||
}
|
||||
```
|
||||
|
||||
If `with_pieces` is `true`:
|
||||
```json
|
||||
{
|
||||
"tokens": [
|
||||
{"id": 123, "piece": "Hello"},
|
||||
{"id": 456, "piece": " world"},
|
||||
{"id": 789, "piece": "!"}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
|
||||
```json
|
||||
{
|
||||
"tokens": [
|
||||
{"id": 198, "piece": [195]}, // hex C3
|
||||
{"id": 164, "piece": [161]} // hex A1
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### POST `/detokenize`: Convert tokens to text
|
||||
|
||||
|
@ -40,7 +40,6 @@ server --host localhost --port 8080 \
|
||||
--parallel 8 \
|
||||
--batch-size 512 \
|
||||
--ctx-size 4096 \
|
||||
--log-format text \
|
||||
-ngl 33
|
||||
```
|
||||
|
||||
|
@ -272,7 +272,6 @@ def start_server_background(args):
|
||||
server_args.append('--cont-batching')
|
||||
server_args.append('--metrics')
|
||||
server_args.append('--flash-attn')
|
||||
server_args.extend(['--log-format', "text"])
|
||||
args = [str(arg) for arg in [server_path, *server_args]]
|
||||
print(f"bench: starting server with: {' '.join(args)}")
|
||||
pkwargs = {
|
||||
|
12
examples/server/public/loading.html
Normal file
12
examples/server/public/loading.html
Normal file
@ -0,0 +1,12 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="refresh" content="5">
|
||||
</head>
|
||||
<body>
|
||||
<div id="loading">
|
||||
The model is loading. Please wait.<br/>
|
||||
The user interface will appear soon.
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
File diff suppressed because it is too large
Load Diff
1
examples/server/tests/.gitignore
vendored
Normal file
1
examples/server/tests/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
.venv
|
@ -40,7 +40,6 @@ It's possible to override some scenario steps values with environment variables:
|
||||
| `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
|
||||
| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/llama-server` |
|
||||
| `DEBUG` | "ON" to enable steps and server verbose mode `--verbose` |
|
||||
| `SERVER_LOG_FORMAT_JSON` | if set switch server logs to json format |
|
||||
| `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` |
|
||||
|
||||
### Run @bug, @wip or @wrong_usage annotated scenario
|
||||
|
@ -105,6 +105,14 @@ Feature: llama.cpp server
|
||||
Given first token is removed
|
||||
Then tokens can be detokenized
|
||||
|
||||
Scenario: Tokenize with pieces
|
||||
When tokenizing with pieces:
|
||||
"""
|
||||
What is the capital of Germany?
|
||||
媽
|
||||
"""
|
||||
Then tokens are given with pieces
|
||||
|
||||
Scenario: Models available
|
||||
Given available models
|
||||
Then 1 models are supported
|
||||
|
@ -1,3 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
@ -697,6 +700,32 @@ def step_tokenize_set_add_special(context):
|
||||
context.tokenize_add_special = True
|
||||
|
||||
|
||||
@step("tokenizing with pieces")
|
||||
@async_run_until_complete
|
||||
async def step_tokenize_with_pieces(context):
|
||||
context.tokenized_text = context_text(context)
|
||||
async with aiohttp.ClientSession() as session:
|
||||
tokenize_args = {"content": context.tokenized_text, "with_pieces": True}
|
||||
if getattr(context, "tokenize_add_special", None) is not None:
|
||||
tokenize_args["add_special"] = context.tokenize_add_special
|
||||
|
||||
async with session.post(
|
||||
f"{context.base_url}/tokenize", json=tokenize_args
|
||||
) as response:
|
||||
assert response.status == 200
|
||||
tokenize_json = await response.json()
|
||||
context.tokens_with_pieces = tokenize_json["tokens"]
|
||||
|
||||
|
||||
@step("tokens are given with pieces")
|
||||
@async_run_until_complete
|
||||
async def step_tokenize_with_pieces(context):
|
||||
# Verify that the response contains both token IDs and pieces
|
||||
assert all(
|
||||
"id" in token and "piece" in token for token in context.tokens_with_pieces
|
||||
)
|
||||
|
||||
|
||||
@step('tokenizing')
|
||||
@async_run_until_complete
|
||||
async def step_tokenize(context):
|
||||
@ -991,6 +1020,8 @@ async def oai_chat_completions(user_prompt,
|
||||
event_data = line.split(': ', 1)
|
||||
assert event_data[0] == 'data', f'Bad event code received: ```{event_data}```'
|
||||
chunk_raw = event_data[1]
|
||||
if chunk_raw == '[DONE]':
|
||||
break
|
||||
|
||||
chunk = json.loads(chunk_raw)
|
||||
assert len(chunk['choices']) == 1, f"no choices provided, line ```{line}```"
|
||||
@ -1341,8 +1372,6 @@ def start_server_background(context):
|
||||
server_args.append('--verbose')
|
||||
if context.lora_file:
|
||||
server_args.extend(['--lora', context.lora_file])
|
||||
if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
|
||||
server_args.extend(['--log-format', "text"])
|
||||
|
||||
args = [str(arg) for arg in [context.server_path, *server_args]]
|
||||
print(f"bench: starting server with: {' '.join(args)}")
|
||||
|
@ -1,7 +1,8 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#ifndef NDEBUG
|
||||
// crash the server in debug mode, otherwise send an http 500 error
|
||||
@ -15,10 +16,10 @@
|
||||
#define JSON_ASSERT GGML_ASSERT
|
||||
#include "json.hpp"
|
||||
|
||||
#include <random>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
#include <random>
|
||||
|
||||
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
|
||||
|
||||
@ -35,32 +36,6 @@ enum error_type {
|
||||
ERROR_TYPE_NOT_SUPPORTED, // custom error
|
||||
};
|
||||
|
||||
extern bool server_verbose;
|
||||
extern bool server_log_json;
|
||||
|
||||
#ifndef SERVER_VERBOSE
|
||||
#define SERVER_VERBOSE 1
|
||||
#endif
|
||||
|
||||
#if SERVER_VERBOSE != 1
|
||||
#define LOG_VERBOSE(MSG, ...)
|
||||
#else
|
||||
#define LOG_VERBOSE(MSG, ...) \
|
||||
do \
|
||||
{ \
|
||||
if (server_verbose) \
|
||||
{ \
|
||||
server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
|
||||
} \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
|
||||
static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra);
|
||||
|
||||
template <typename T>
|
||||
static T json_value(const json & body, const std::string & key, const T & default_value) {
|
||||
// Fallback null to default value
|
||||
@ -68,9 +43,7 @@ static T json_value(const json & body, const std::string & key, const T & defaul
|
||||
try {
|
||||
return body.at(key);
|
||||
} catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
|
||||
std::stringstream ss;
|
||||
ss << "Wrong type supplied for parameter '" << key << "'. Expected '" << json(default_value).type_name() << "', using default value.";
|
||||
LOG_WARNING(ss.str().c_str(), body);
|
||||
LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name());
|
||||
return default_value;
|
||||
}
|
||||
} else {
|
||||
@ -78,48 +51,6 @@ static T json_value(const json & body, const std::string & key, const T & defaul
|
||||
}
|
||||
}
|
||||
|
||||
static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra) {
|
||||
std::stringstream ss_tid;
|
||||
ss_tid << std::this_thread::get_id();
|
||||
json log = json{
|
||||
{"tid", ss_tid.str()},
|
||||
{"timestamp", time(nullptr)},
|
||||
};
|
||||
|
||||
if (server_log_json) {
|
||||
log.merge_patch({
|
||||
{"level", level},
|
||||
{"function", function},
|
||||
{"line", line},
|
||||
{"msg", message},
|
||||
});
|
||||
|
||||
if (!extra.empty()) {
|
||||
log.merge_patch(extra);
|
||||
}
|
||||
|
||||
printf("%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str());
|
||||
} else {
|
||||
char buf[1024];
|
||||
snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
|
||||
|
||||
if (!extra.empty()) {
|
||||
log.merge_patch(extra);
|
||||
}
|
||||
std::stringstream ss;
|
||||
ss << buf << " |";
|
||||
for (const auto & el : log.items())
|
||||
{
|
||||
const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
|
||||
ss << " " << el.key() << "=" << value;
|
||||
}
|
||||
|
||||
const std::string str = ss.str();
|
||||
printf("%.*s\n", (int)str.size(), str.data());
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
//
|
||||
// chat template utils
|
||||
//
|
||||
@ -153,8 +84,9 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
|
||||
chat.push_back({role, content});
|
||||
}
|
||||
|
||||
auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
|
||||
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
|
||||
const auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
|
||||
LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
|
||||
|
||||
return formatted_chat;
|
||||
}
|
||||
|
||||
@ -243,10 +175,7 @@ static std::string random_string() {
|
||||
}
|
||||
|
||||
static std::string gen_chatcmplid() {
|
||||
std::stringstream chatcmplid;
|
||||
chatcmplid << "chatcmpl-" << random_string();
|
||||
|
||||
return chatcmplid.str();
|
||||
return "chatcmpl-" + random_string();
|
||||
}
|
||||
|
||||
//
|
||||
@ -287,7 +216,7 @@ static size_t find_partial_stop_string(const std::string &stop, const std::strin
|
||||
return std::string::npos;
|
||||
}
|
||||
|
||||
static bool json_is_array_of_numbers(json data) {
|
||||
static bool json_is_array_of_numbers(const json & data) {
|
||||
if (data.is_array()) {
|
||||
for (const auto & e : data) {
|
||||
if (!e.is_number()) {
|
||||
@ -363,15 +292,13 @@ static json probs_vector_to_json(const llama_context * ctx, const std::vector<co
|
||||
return out;
|
||||
}
|
||||
|
||||
static bool server_sent_event(httplib::DataSink & sink, const char * event, json & data) {
|
||||
static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) {
|
||||
const std::string str =
|
||||
std::string(event) + ": " +
|
||||
data.dump(-1, ' ', false, json::error_handler_t::replace) +
|
||||
"\n\n";
|
||||
"\n\n"; // note: these newlines are important (not sure why though, if you know, add a comment to explain)
|
||||
|
||||
LOG_VERBOSE("data stream", {
|
||||
{ "to_send", str }
|
||||
});
|
||||
LOG_DBG("data stream, to_send: %s", str.c_str());
|
||||
|
||||
return sink.write(str.c_str(), str.size());
|
||||
}
|
||||
@ -425,7 +352,7 @@ static json oaicompat_completion_params_parse(
|
||||
|
||||
// Params supported by OAI but unsupported by llama.cpp
|
||||
static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
|
||||
for (auto & param : unsupported_params) {
|
||||
for (const auto & param : unsupported_params) {
|
||||
if (body.contains(param)) {
|
||||
throw std::runtime_error("Unsupported param: " + param);
|
||||
}
|
||||
@ -444,7 +371,7 @@ static json oaicompat_completion_params_parse(
|
||||
return llama_params;
|
||||
}
|
||||
|
||||
static json format_final_response_oaicompat(const json & request, json result, const std::string & completion_id, bool streaming = false) {
|
||||
static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) {
|
||||
bool stopped_word = result.count("stopped_word") != 0;
|
||||
bool stopped_eos = json_value(result, "stopped_eos", false);
|
||||
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
|
||||
@ -481,7 +408,8 @@ static json format_final_response_oaicompat(const json & request, json result, c
|
||||
{"id", completion_id}
|
||||
};
|
||||
|
||||
if (server_verbose) {
|
||||
// extra fields for debugging purposes
|
||||
if (verbose) {
|
||||
res["__verbose"] = result;
|
||||
}
|
||||
|
||||
@ -493,7 +421,7 @@ static json format_final_response_oaicompat(const json & request, json result, c
|
||||
}
|
||||
|
||||
// return value is vector as there is one case where we might need to generate two responses
|
||||
static std::vector<json> format_partial_response_oaicompat(json result, const std::string & completion_id) {
|
||||
static std::vector<json> format_partial_response_oaicompat(const json & result, const std::string & completion_id) {
|
||||
if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
|
||||
return std::vector<json>({result});
|
||||
}
|
||||
@ -595,7 +523,7 @@ static std::vector<json> format_partial_response_oaicompat(json result, const st
|
||||
static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
|
||||
json data = json::array();
|
||||
int i = 0;
|
||||
for (auto & elem : embeddings) {
|
||||
for (const auto & elem : embeddings) {
|
||||
data.push_back(json{
|
||||
{"embedding", json_value(elem, "embedding", json::array())},
|
||||
{"index", i++},
|
||||
@ -616,7 +544,40 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
|
||||
return res;
|
||||
}
|
||||
|
||||
static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
|
||||
static bool is_valid_utf8(const std::string & str) {
|
||||
const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
|
||||
const unsigned char* end = bytes + str.length();
|
||||
|
||||
while (bytes < end) {
|
||||
if (*bytes <= 0x7F) {
|
||||
// 1-byte sequence (0xxxxxxx)
|
||||
bytes++;
|
||||
} else if ((*bytes & 0xE0) == 0xC0) {
|
||||
// 2-byte sequence (110xxxxx 10xxxxxx)
|
||||
if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
|
||||
return false;
|
||||
bytes += 2;
|
||||
} else if ((*bytes & 0xF0) == 0xE0) {
|
||||
// 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
|
||||
if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
|
||||
return false;
|
||||
bytes += 3;
|
||||
} else if ((*bytes & 0xF8) == 0xF0) {
|
||||
// 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
|
||||
if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
|
||||
(bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
|
||||
return false;
|
||||
bytes += 4;
|
||||
} else {
|
||||
// Invalid UTF-8 lead byte
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static json format_tokenizer_response(const json & tokens) {
|
||||
return json {
|
||||
{"tokens", tokens}
|
||||
};
|
||||
|
@ -1,15 +1,14 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
static void print_usage(int, char ** argv) {
|
||||
LOG_TEE("\nexample usage:\n");
|
||||
LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
|
||||
LOG_TEE("\n");
|
||||
LOG("\nexample usage:\n");
|
||||
LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
|
||||
LOG("\n");
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
@ -18,11 +17,12 @@ int main(int argc, char ** argv) {
|
||||
params.prompt = "Hello my name is";
|
||||
params.n_predict = 32;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
gpt_init();
|
||||
|
||||
// total length of the sequence including the prompt
|
||||
const int n_predict = params.n_predict;
|
||||
|
||||
@ -69,25 +69,24 @@ int main(int argc, char ** argv) {
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
|
||||
|
||||
LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
|
||||
LOG("\n");
|
||||
LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
|
||||
|
||||
// make sure the KV cache is big enough to hold all the prompt and generated tokens
|
||||
if (n_kv_req > n_ctx) {
|
||||
LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
|
||||
LOG_TEE("%s: either reduce n_predict or increase n_ctx\n", __func__);
|
||||
LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
|
||||
LOG_ERR("%s: either reduce n_predict or increase n_ctx\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// print the prompt token-by-token
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
LOG("\n");
|
||||
|
||||
for (auto id : tokens_list) {
|
||||
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
|
||||
LOG("%s", llama_token_to_piece(ctx, id).c_str());
|
||||
}
|
||||
|
||||
fflush(stderr);
|
||||
|
||||
// create a llama_batch with size 512
|
||||
// we use this object to submit token data for decoding
|
||||
|
||||
@ -102,7 +101,7 @@ int main(int argc, char ** argv) {
|
||||
batch.logits[batch.n_tokens - 1] = true;
|
||||
|
||||
if (llama_decode(ctx, batch) != 0) {
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
LOG("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -116,18 +115,16 @@ int main(int argc, char ** argv) {
|
||||
while (n_cur <= n_predict) {
|
||||
// sample the next token
|
||||
{
|
||||
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
|
||||
|
||||
llama_sampler_accept(smpl, new_token_id);
|
||||
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1);
|
||||
|
||||
// is it an end of generation?
|
||||
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
|
||||
LOG_TEE("\n");
|
||||
LOG("\n");
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
|
||||
LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
|
||||
fflush(stdout);
|
||||
|
||||
// prepare the next batch
|
||||
@ -143,23 +140,23 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// evaluate the current batch with the transformer model
|
||||
if (llama_decode(ctx, batch)) {
|
||||
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
|
||||
LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG("\n");
|
||||
|
||||
const auto t_main_end = ggml_time_us();
|
||||
|
||||
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
|
||||
LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
|
||||
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
||||
|
||||
LOG_TEE("\n");
|
||||
llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
|
||||
llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
|
||||
LOG("\n");
|
||||
llama_perf_sampler_print(smpl);
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
LOG("\n");
|
||||
|
||||
llama_batch_free(batch);
|
||||
llama_sampler_free(smpl);
|
||||
|
@ -1,11 +1,16 @@
|
||||
#include "arg.h"
|
||||
#include "common.h"
|
||||
#include "sampling.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <random>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
|
||||
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
|
||||
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
||||
@ -27,13 +32,14 @@ struct seq_draft {
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SPECULATIVE);
|
||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
gpt_init();
|
||||
|
||||
if (params.model_draft.empty()) {
|
||||
fprintf(stderr, "%s: error: --model-draft is required\n", __func__);
|
||||
LOG_ERR("%s: --model-draft is required\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -46,12 +52,6 @@ int main(int argc, char ** argv) {
|
||||
std::default_random_engine rng(params.sparams.seed);
|
||||
std::uniform_real_distribution<> u_dist;
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("speculative", "log"));
|
||||
LOG_TEE("Log start\n");
|
||||
log_dump_cmdline(argc, argv);
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
// init llama.cpp
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
@ -80,14 +80,14 @@ int main(int argc, char ** argv) {
|
||||
ctx_dft = llama_init_dft.context;
|
||||
|
||||
const bool vocab_type_tgt = llama_vocab_type(model_tgt);
|
||||
LOG("vocab_type tgt: %d\n", vocab_type_tgt);
|
||||
LOG_DBG("vocab_type tgt: %d\n", vocab_type_tgt);
|
||||
|
||||
const bool vocab_type_dft = llama_vocab_type(model_dft);
|
||||
LOG("vocab_type dft: %d\n", vocab_type_dft);
|
||||
LOG_DBG("vocab_type dft: %d\n", vocab_type_dft);
|
||||
|
||||
if (vocab_type_tgt != vocab_type_dft) {
|
||||
fprintf(stderr, "%s: error: draft model vocab type must match target model to use speculation but ", __func__);
|
||||
fprintf(stderr, "vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
|
||||
LOG_ERR("%s: draft model vocab type must match target model to use speculation but ", __func__);
|
||||
LOG_ERR("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -97,7 +97,7 @@ int main(int argc, char ** argv) {
|
||||
llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
|
||||
llama_token_eos(model_tgt) != llama_token_eos(model_dft)
|
||||
) {
|
||||
fprintf(stderr, "%s: error: draft model special tokens must match target model to use speculation\n", __func__);
|
||||
LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -109,8 +109,8 @@ int main(int argc, char ** argv) {
|
||||
: n_vocab_dft - n_vocab_tgt;
|
||||
|
||||
if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
|
||||
fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__);
|
||||
fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
|
||||
LOG_ERR("%s: draft model vocab must closely match target model to use speculation but ", __func__);
|
||||
LOG_ERR("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
|
||||
n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
|
||||
return 1;
|
||||
}
|
||||
@ -119,8 +119,8 @@ int main(int argc, char ** argv) {
|
||||
const char * token_text_tgt = llama_token_get_text(model_tgt, i);
|
||||
const char * token_text_dft = llama_token_get_text(model_dft, i);
|
||||
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
|
||||
fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__);
|
||||
fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i,
|
||||
LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
|
||||
LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
|
||||
llama_token_to_piece(ctx_tgt, i).c_str(),
|
||||
llama_token_to_piece(ctx_dft, i).c_str());
|
||||
return 1;
|
||||
@ -137,18 +137,16 @@ int main(int argc, char ** argv) {
|
||||
const int max_tokens_list_size = max_context_size - 4;
|
||||
|
||||
if ((int) inp.size() > max_tokens_list_size) {
|
||||
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
|
||||
LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
LOG("\n\n");
|
||||
|
||||
for (auto id : inp) {
|
||||
fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str());
|
||||
LOG("%s", llama_token_to_piece(ctx_tgt, id).c_str());
|
||||
}
|
||||
|
||||
fflush(stderr);
|
||||
|
||||
const int n_input = inp.size();
|
||||
|
||||
const auto t_enc_start = ggml_time_us();
|
||||
@ -210,7 +208,7 @@ int main(int argc, char ** argv) {
|
||||
active_seqs.insert(s);
|
||||
const auto & tokens = drafts[s].tokens;
|
||||
|
||||
LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str());
|
||||
LOG_DBG("draft %d: %s\n", s, string_from(ctx_dft, tokens).c_str());
|
||||
}
|
||||
|
||||
int i_dft = 0;
|
||||
@ -253,7 +251,7 @@ int main(int argc, char ** argv) {
|
||||
continue;
|
||||
}
|
||||
|
||||
LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
|
||||
LOG_DBG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
|
||||
float r = u_dist(rng);
|
||||
llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true };
|
||||
|
||||
@ -271,7 +269,7 @@ int main(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
LOG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
|
||||
LOG_DBG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
|
||||
if (r <= p_tgt / p_dft) {
|
||||
s_keep = s;
|
||||
accept = true;
|
||||
@ -279,10 +277,10 @@ int main(int argc, char ** argv) {
|
||||
token_str = llama_token_to_piece(ctx_tgt, token_id);
|
||||
gpt_sampler_accept(smpl, token_id, true);
|
||||
|
||||
LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
|
||||
LOG_DBG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
|
||||
break;
|
||||
} else {
|
||||
LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
|
||||
LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
|
||||
drafts[s].active = false;
|
||||
|
||||
// calculate residual probability
|
||||
@ -337,7 +335,7 @@ int main(int argc, char ** argv) {
|
||||
if (!accept) {
|
||||
// all drafted tokens were rejected
|
||||
// sample from the target model
|
||||
LOG("all drafted tokens were rejected, sampling from residual distribution\n");
|
||||
LOG_DBG("all drafted tokens were rejected, sampling from residual distribution\n");
|
||||
std::vector<float> probs(dist_tgt.size);
|
||||
for (size_t i = 0; i < dist_tgt.size; ++i) {
|
||||
probs[i] = dist_tgt.data[i].p;
|
||||
@ -355,13 +353,11 @@ int main(int argc, char ** argv) {
|
||||
// greedy verification
|
||||
|
||||
// sample from the target model
|
||||
LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||
LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||
token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||
|
||||
gpt_sampler_accept(smpl, token_id, true);
|
||||
|
||||
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, smpl->prev).c_str());
|
||||
|
||||
token_str = llama_token_to_piece(ctx_tgt, token_id);
|
||||
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
@ -370,7 +366,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) {
|
||||
LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());
|
||||
LOG_DBG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());
|
||||
|
||||
s_keep = s;
|
||||
accept = true;
|
||||
@ -392,26 +388,24 @@ int main(int argc, char ** argv) {
|
||||
++i_dft;
|
||||
if (params.use_color) {
|
||||
// Color token according to its origin sequence
|
||||
printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
|
||||
LOG("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
|
||||
} else {
|
||||
printf("%s", token_str.c_str());
|
||||
LOG("%s", token_str.c_str());
|
||||
}
|
||||
fflush(stdout);
|
||||
continue;
|
||||
} else {
|
||||
printf("%s", token_str.c_str());
|
||||
fflush(stdout);
|
||||
LOG("%s", token_str.c_str());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());
|
||||
LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());
|
||||
|
||||
// TODO: simplify
|
||||
{
|
||||
LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
|
||||
LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
|
||||
|
||||
llama_kv_cache_seq_keep(ctx_dft, s_keep);
|
||||
llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1);
|
||||
@ -438,7 +432,7 @@ int main(int argc, char ** argv) {
|
||||
llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);
|
||||
|
||||
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
|
||||
// LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
|
||||
// LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
|
||||
llama_decode(ctx_dft, batch_dft);
|
||||
|
||||
++n_past_dft;
|
||||
@ -485,7 +479,7 @@ int main(int argc, char ** argv) {
|
||||
const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl);
|
||||
|
||||
for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
|
||||
LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
|
||||
LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
|
||||
k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
|
||||
}
|
||||
|
||||
@ -494,7 +488,7 @@ int main(int argc, char ** argv) {
|
||||
// attempt to split the branch if the probability is high enough
|
||||
for (int f = 1; f < 8; ++f) {
|
||||
if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) {
|
||||
LOG("splitting seq %3d into %3d\n", s, n_seq_cur);
|
||||
LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);
|
||||
|
||||
llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1);
|
||||
llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
|
||||
@ -583,7 +577,7 @@ int main(int argc, char ** argv) {
|
||||
llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
|
||||
}
|
||||
|
||||
// LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
|
||||
// LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
|
||||
llama_decode(ctx_tgt, batch_tgt);
|
||||
++n_past_tgt;
|
||||
}
|
||||
@ -601,23 +595,25 @@ int main(int argc, char ** argv) {
|
||||
|
||||
auto t_dec_end = ggml_time_us();
|
||||
|
||||
LOG_TEE("\n\n");
|
||||
LOG("\n\n");
|
||||
|
||||
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
|
||||
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
|
||||
LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
|
||||
LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("n_draft = %d\n", n_draft);
|
||||
LOG_TEE("n_predict = %d\n", n_predict);
|
||||
LOG_TEE("n_drafted = %d\n", n_drafted);
|
||||
LOG_TEE("n_accept = %d\n", n_accept);
|
||||
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||
LOG_INF("\n");
|
||||
LOG_INF("n_draft = %d\n", n_draft);
|
||||
LOG_INF("n_predict = %d\n", n_predict);
|
||||
LOG_INF("n_drafted = %d\n", n_drafted);
|
||||
LOG_INF("n_accept = %d\n", n_accept);
|
||||
LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||
|
||||
LOG_TEE("\ndraft:\n\n");
|
||||
LOG_INF("\n");
|
||||
LOG_INF("draft:\n\n");
|
||||
// TODO: print sampling/grammar timings for all drafts
|
||||
llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT);
|
||||
llama_perf_context_print(ctx_dft);
|
||||
|
||||
LOG_TEE("\ntarget:\n\n");
|
||||
LOG_INF("\n");
|
||||
LOG_INF("target:\n\n");
|
||||
gpt_perf_print(ctx_tgt, smpl);
|
||||
|
||||
gpt_sampler_free(smpl);
|
||||
@ -636,7 +632,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
LOG("\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -4,33 +4,23 @@
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
if [ $# -gt 0 ]; then
|
||||
GGML_SYCL_DEVICE=$1
|
||||
GGML_SYCL_SINGLE_GPU=1
|
||||
else
|
||||
GGML_SYCL_DEVICE=0
|
||||
GGML_SYCL_SINGLE_GPU=0
|
||||
fi
|
||||
|
||||
#export GGML_SYCL_DEBUG=1
|
||||
|
||||
|
||||
#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
|
||||
|
||||
if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
|
||||
INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
MODEL_FILE=llama-2-7b.Q4_0.gguf
|
||||
NGL=33
|
||||
|
||||
if [ $# -gt 0 ]; then
|
||||
GGML_SYCL_DEVICE=$1
|
||||
echo "use $GGML_SYCL_DEVICE as main GPU"
|
||||
#use signle GPU only
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -mg $GGML_SYCL_DEVICE -sm none
|
||||
|
||||
else
|
||||
#use multiple GPUs with same max compute units
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0
|
||||
fi
|
||||
|
||||
#use main GPU only
|
||||
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
|
||||
|
||||
#use multiple GPUs with same max compute units
|
||||
#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
|
@ -1,11 +1,13 @@
|
||||
#include "common.h"
|
||||
//#include "log.h" // TODO: start using log.h
|
||||
#include "llama.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <iostream> // TODO: remove me
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
@ -13,25 +15,25 @@
|
||||
#include <shellapi.h> // For CommandLineToArgvW
|
||||
#endif
|
||||
|
||||
static void print_usage_information(const char * argv0, FILE * stream) {
|
||||
fprintf(stream, "usage: %s [options]\n\n", argv0);
|
||||
fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n");
|
||||
fprintf(stream, "and prints the resulting tokens to standard output.\n\n");
|
||||
fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n");
|
||||
fprintf(stream, "to control the behavior of the tokenizer.\n\n");
|
||||
fprintf(stream, " The possible options are:\n");
|
||||
fprintf(stream, "\n");
|
||||
fprintf(stream, " -h, --help print this help and exit\n");
|
||||
fprintf(stream, " -m MODEL_PATH, --model MODEL_PATH path to model.\n");
|
||||
fprintf(stream, " --ids if given, only print numerical token IDs, and not token strings.\n");
|
||||
fprintf(stream, " The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
|
||||
fprintf(stream, " -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
|
||||
fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
|
||||
fprintf(stream, " --stdin read prompt from standard input.\n");
|
||||
fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
|
||||
fprintf(stream, " --no-parse-special do not parse control tokens.\n");
|
||||
fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
|
||||
fprintf(stream, " --show-count print the total number of tokens.\n");
|
||||
static void print_usage_information(const char * argv0) {
|
||||
printf("usage: %s [options]\n\n", argv0);
|
||||
printf("The tokenize program tokenizes a prompt using a given model,\n");
|
||||
printf("and prints the resulting tokens to standard output.\n\n");
|
||||
printf("It needs a model file, a prompt, and optionally other flags\n");
|
||||
printf("to control the behavior of the tokenizer.\n\n");
|
||||
printf(" The possible options are:\n");
|
||||
printf("\n");
|
||||
printf(" -h, --help print this help and exit\n");
|
||||
printf(" -m MODEL_PATH, --model MODEL_PATH path to model.\n");
|
||||
printf(" --ids if given, only print numerical token IDs, and not token strings.\n");
|
||||
printf(" The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
|
||||
printf(" -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
|
||||
printf(" -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
|
||||
printf(" --stdin read prompt from standard input.\n");
|
||||
printf(" --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
|
||||
printf(" --no-parse-special do not parse control tokens.\n");
|
||||
printf(" --log-disable disable logs. Makes stderr quiet when loading the model.\n");
|
||||
printf(" --show-count print the total number of tokens.\n");
|
||||
}
|
||||
|
||||
static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
|
||||
@ -185,7 +187,7 @@ int main(int raw_argc, char ** raw_argv) {
|
||||
const int argc = argv.size();
|
||||
|
||||
if (argc <= 1) {
|
||||
print_usage_information(argv[0].c_str(), stderr);
|
||||
print_usage_information(argv[0].c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -214,7 +216,7 @@ int main(int raw_argc, char ** raw_argv) {
|
||||
for (; iarg < argc; ++iarg) {
|
||||
std::string arg{argv[iarg]};
|
||||
if (arg == "-h" || arg == "--help") {
|
||||
print_usage_information(argv[0].c_str(), stdout);
|
||||
print_usage_information(argv[0].c_str());
|
||||
return 0;
|
||||
}
|
||||
else if (arg == "--ids") {
|
||||
@ -323,10 +325,6 @@ int main(int raw_argc, char ** raw_argv) {
|
||||
// Start actually doing the tokenizing stuff.
|
||||
//////
|
||||
|
||||
#ifdef LOG_DISABLE_LOGS
|
||||
disable_logging = true;
|
||||
#endif
|
||||
|
||||
if (disable_logging) {
|
||||
llama_log_set(llama_log_callback_null, NULL);
|
||||
}
|
||||
|
20
flake.lock
20
flake.lock
@ -5,11 +5,11 @@
|
||||
"nixpkgs-lib": "nixpkgs-lib"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1725024810,
|
||||
"narHash": "sha256-ODYRm8zHfLTH3soTFWE452ydPYz2iTvr9T8ftDMUQ3E=",
|
||||
"lastModified": 1726153070,
|
||||
"narHash": "sha256-HO4zgY0ekfwO5bX0QH/3kJ/h4KvUDFZg8YpkNwIbg1U=",
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"rev": "af510d4a62d071ea13925ce41c95e3dec816c01d",
|
||||
"rev": "bcef6817a8b2aa20a5a6dbb19b43e63c5bf8619a",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@ -20,11 +20,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1724819573,
|
||||
"narHash": "sha256-GnR7/ibgIH1vhoy8cYdmXE6iyZqKqFxQSVkFgosBh6w=",
|
||||
"lastModified": 1726062873,
|
||||
"narHash": "sha256-IiA3jfbR7K/B5+9byVi9BZGWTD4VSbWe8VLpp9B/iYk=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "71e91c409d1e654808b2621f28a327acfdad8dc2",
|
||||
"rev": "4f807e8940284ad7925ebd0a0993d2a1791acb2f",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@ -36,14 +36,14 @@
|
||||
},
|
||||
"nixpkgs-lib": {
|
||||
"locked": {
|
||||
"lastModified": 1722555339,
|
||||
"narHash": "sha256-uFf2QeW7eAHlYXuDktm9c25OxOyCoUOQmh5SZ9amE5Q=",
|
||||
"lastModified": 1725233747,
|
||||
"narHash": "sha256-Ss8QWLXdr2JCBPcYChJhz4xJm+h/xjl4G0c0XlP6a74=",
|
||||
"type": "tarball",
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
|
||||
},
|
||||
"original": {
|
||||
"type": "tarball",
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
|
||||
"url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
|
||||
}
|
||||
},
|
||||
"root": {
|
||||
|
@ -56,6 +56,15 @@ else()
|
||||
set(GGML_NATIVE_DEFAULT ON)
|
||||
endif()
|
||||
|
||||
# defaults
|
||||
if (NOT GGML_LLAMAFILE_DEFAULT)
|
||||
set(GGML_LLAMAFILE_DEFAULT OFF)
|
||||
endif()
|
||||
|
||||
if (NOT GGML_CUDA_GRAPHS_DEFAULT)
|
||||
set(GGML_CUDA_GRAPHS_DEFAULT OFF)
|
||||
endif()
|
||||
|
||||
# general
|
||||
option(GGML_STATIC "ggml: static link libraries" OFF)
|
||||
option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT})
|
||||
@ -110,7 +119,7 @@ option(GGML_ACCELERATE "ggml: enable Accelerate framework"
|
||||
option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
|
||||
set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
|
||||
"ggml: BLAS library vendor")
|
||||
option(GGML_LLAMAFILE "ggml: use LLAMAFILE" OFF)
|
||||
option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT})
|
||||
|
||||
option(GGML_CUDA "ggml: use CUDA" OFF)
|
||||
option(GGML_MUSA "ggml: use MUSA" OFF)
|
||||
@ -127,7 +136,7 @@ set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
|
||||
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
|
||||
option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
|
||||
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
|
||||
option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" OFF)
|
||||
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
|
||||
|
||||
option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
|
||||
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
|
||||
|
@ -80,6 +80,13 @@ ggml_backend_cann_buffer_type(int32_t device);
|
||||
*/
|
||||
GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void);
|
||||
|
||||
/**
|
||||
* @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
|
||||
*
|
||||
* @return A pointer to the host buffer type interface.
|
||||
*/
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
|
||||
|
||||
/**
|
||||
* @brief Retrieves the description of a specific CANN device.
|
||||
*
|
||||
|
@ -358,6 +358,7 @@ extern "C" {
|
||||
|
||||
struct ggml_object;
|
||||
struct ggml_context;
|
||||
struct ggml_cgraph;
|
||||
|
||||
// NOTE: always add types at the end of the enum to keep backward compatibility
|
||||
enum ggml_type {
|
||||
@ -563,10 +564,11 @@ extern "C" {
|
||||
};
|
||||
|
||||
enum ggml_log_level {
|
||||
GGML_LOG_LEVEL_ERROR = 2,
|
||||
GGML_LOG_LEVEL_WARN = 3,
|
||||
GGML_LOG_LEVEL_INFO = 4,
|
||||
GGML_LOG_LEVEL_DEBUG = 5
|
||||
GGML_LOG_LEVEL_NONE = 0,
|
||||
GGML_LOG_LEVEL_INFO = 1,
|
||||
GGML_LOG_LEVEL_WARN = 2,
|
||||
GGML_LOG_LEVEL_ERROR = 3,
|
||||
GGML_LOG_LEVEL_DEBUG = 4,
|
||||
};
|
||||
|
||||
enum ggml_tensor_flag {
|
||||
@ -575,20 +577,6 @@ extern "C" {
|
||||
GGML_TENSOR_FLAG_PARAM = 4,
|
||||
};
|
||||
|
||||
// ggml object
|
||||
struct ggml_object {
|
||||
size_t offs;
|
||||
size_t size;
|
||||
|
||||
struct ggml_object * next;
|
||||
|
||||
enum ggml_object_type type;
|
||||
|
||||
char padding[4];
|
||||
};
|
||||
|
||||
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
|
||||
|
||||
// n-dimensional tensor
|
||||
struct ggml_tensor {
|
||||
enum ggml_type type;
|
||||
@ -671,35 +659,6 @@ extern "C" {
|
||||
void * abort_callback_data;
|
||||
};
|
||||
|
||||
enum ggml_cgraph_eval_order {
|
||||
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
|
||||
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
|
||||
GGML_CGRAPH_EVAL_ORDER_COUNT
|
||||
};
|
||||
|
||||
typedef uint32_t ggml_bitset_t;
|
||||
|
||||
struct ggml_hash_set {
|
||||
size_t size;
|
||||
ggml_bitset_t * used; // whether or not the keys are in use i.e. set
|
||||
struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
|
||||
};
|
||||
|
||||
// computation graph
|
||||
struct ggml_cgraph {
|
||||
int size;
|
||||
int n_nodes;
|
||||
int n_leafs;
|
||||
|
||||
struct ggml_tensor ** nodes;
|
||||
struct ggml_tensor ** grads;
|
||||
struct ggml_tensor ** leafs;
|
||||
|
||||
struct ggml_hash_set visited_hash_set;
|
||||
|
||||
enum ggml_cgraph_eval_order order;
|
||||
};
|
||||
|
||||
// scratch buffer
|
||||
struct ggml_scratch {
|
||||
size_t offs;
|
||||
@ -2017,8 +1976,6 @@ extern "C" {
|
||||
typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
|
||||
typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
|
||||
|
||||
#define GGML_N_TASKS_MAX -1
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_map_custom1(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
@ -2088,7 +2045,6 @@ extern "C" {
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * tensor);
|
||||
|
||||
|
||||
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
|
||||
|
||||
@ -2096,11 +2052,17 @@ extern "C" {
|
||||
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
|
||||
GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
|
||||
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
||||
GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1);
|
||||
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
|
||||
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
|
||||
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
|
||||
|
||||
GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph);
|
||||
GGML_API struct ggml_tensor * ggml_graph_node (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
|
||||
GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph);
|
||||
GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
|
||||
|
||||
GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API size_t ggml_graph_overhead(void);
|
||||
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
|
||||
|
||||
@ -2509,6 +2471,7 @@ extern "C" {
|
||||
GGML_API int ggml_cpu_has_gpublas (void);
|
||||
GGML_API int ggml_cpu_has_sse3 (void);
|
||||
GGML_API int ggml_cpu_has_ssse3 (void);
|
||||
GGML_API int ggml_cpu_has_riscv_v (void);
|
||||
GGML_API int ggml_cpu_has_sycl (void);
|
||||
GGML_API int ggml_cpu_has_rpc (void);
|
||||
GGML_API int ggml_cpu_has_vsx (void);
|
||||
|
@ -26,6 +26,9 @@ if (NOT MSVC)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
unset(GGML_EXTRA_LIBS_PRIVATE)
|
||||
unset(GGML_EXTRA_LIBS_PUBLIC)
|
||||
|
||||
if (APPLE AND GGML_ACCELERATE)
|
||||
find_library(ACCELERATE_FRAMEWORK Accelerate)
|
||||
if (ACCELERATE_FRAMEWORK)
|
||||
@ -35,7 +38,7 @@ if (APPLE AND GGML_ACCELERATE)
|
||||
add_compile_definitions(ACCELERATE_NEW_LAPACK)
|
||||
add_compile_definitions(ACCELERATE_LAPACK_ILP64)
|
||||
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE ${ACCELERATE_FRAMEWORK})
|
||||
else()
|
||||
message(WARNING "Accelerate framework not found")
|
||||
endif()
|
||||
@ -87,7 +90,7 @@ if (GGML_METAL)
|
||||
COMMENT "Generate assembly for embedded Metal library"
|
||||
)
|
||||
|
||||
set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM})
|
||||
list(APPEND GGML_SOURCES_METAL ${METALLIB_EMBED_ASM})
|
||||
else()
|
||||
if (GGML_METAL_SHADER_DEBUG)
|
||||
# custom command to do the following:
|
||||
@ -132,7 +135,7 @@ if (GGML_METAL)
|
||||
)
|
||||
endif() # GGML_METAL_EMBED_LIBRARY
|
||||
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS}
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE
|
||||
${FOUNDATION_LIBRARY}
|
||||
${METAL_FRAMEWORK}
|
||||
${METALKIT_FRAMEWORK}
|
||||
@ -157,11 +160,11 @@ if (GGML_OPENMP)
|
||||
|
||||
add_compile_definitions(GGML_USE_OPENMP)
|
||||
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
|
||||
if (GGML_MUSA)
|
||||
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} "/usr/lib/llvm-10/include/openmp")
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} "/usr/lib/llvm-10/lib/libomp.so")
|
||||
list(APPEND GGML_EXTRA_INCLUDES "/usr/lib/llvm-10/include/openmp")
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-10/lib/libomp.so")
|
||||
endif()
|
||||
else()
|
||||
message(WARNING "OpenMP not found")
|
||||
@ -244,8 +247,8 @@ if (GGML_BLAS)
|
||||
set(GGML_HEADERS_BLAS ../include/ggml-blas.h)
|
||||
set(GGML_SOURCES_BLAS ggml-blas.cpp)
|
||||
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${BLAS_LIBRARIES})
|
||||
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE ${BLAS_LIBRARIES})
|
||||
list(APPEND GGML_EXTRA_INCLUDES ${BLAS_INCLUDE_DIRS})
|
||||
else()
|
||||
message(WARNING "BLAS not found, please refer to "
|
||||
"https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
|
||||
@ -326,7 +329,7 @@ if (GGML_CUDA)
|
||||
add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
|
||||
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
|
||||
|
||||
if (GGML_CUDA_USE_GRAPHS)
|
||||
if (GGML_CUDA_GRAPHS)
|
||||
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
|
||||
endif()
|
||||
|
||||
@ -368,19 +371,19 @@ if (GGML_CUDA)
|
||||
if (GGML_STATIC)
|
||||
if (WIN32)
|
||||
# As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
|
||||
else ()
|
||||
if (GGML_MUSA)
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart_static MUSA::mublas_static)
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musart_static MUSA::mublas_static)
|
||||
else()
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
|
||||
endif()
|
||||
endif()
|
||||
else()
|
||||
if (GGML_MUSA)
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart MUSA::mublas)
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musart MUSA::mublas)
|
||||
else()
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@ -388,9 +391,9 @@ if (GGML_CUDA)
|
||||
# No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
|
||||
else()
|
||||
if (GGML_MUSA)
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ...
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ...
|
||||
else()
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
|
||||
endif()
|
||||
endif()
|
||||
else()
|
||||
@ -495,7 +498,7 @@ if (GGML_HIPBLAS)
|
||||
|
||||
if (CXX_IS_HIPCC)
|
||||
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} hip::device)
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE hip::device)
|
||||
else()
|
||||
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
|
||||
endif()
|
||||
@ -504,7 +507,7 @@ if (GGML_HIPBLAS)
|
||||
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
|
||||
endif()
|
||||
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas)
|
||||
list(APPEND GGML_EXTRA_LIBS_PUBLIC hip::host roc::rocblas roc::hipblas)
|
||||
endif()
|
||||
|
||||
if (GGML_SYCL)
|
||||
@ -513,6 +516,7 @@ if (GGML_SYCL)
|
||||
endif()
|
||||
|
||||
check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
|
||||
|
||||
if (DEFINED ENV{ONEAPI_ROOT})
|
||||
message(STATUS "Using oneAPI Release SYCL compiler (icpx).")
|
||||
elseif(SUPPORTS_SYCL)
|
||||
@ -551,26 +555,29 @@ if (GGML_SYCL)
|
||||
|
||||
find_package(DNNL)
|
||||
message("-- DNNL found:" ${DNNL_FOUND})
|
||||
|
||||
if (GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND})
|
||||
else()
|
||||
add_compile_definitions(GGML_SYCL_DNNL=0)
|
||||
endif()
|
||||
|
||||
if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE DNNL::dnnl)
|
||||
endif()
|
||||
|
||||
if (WIN32)
|
||||
find_package(IntelSYCL REQUIRED)
|
||||
find_package(MKL REQUIRED)
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
|
||||
else()
|
||||
if (GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
|
||||
elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl pthread m dl onemkl)
|
||||
endif()
|
||||
endif()
|
||||
if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||
list(APPEND GGML_EXTRA_LIBS DNNL::dnnl)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_RPC)
|
||||
@ -579,7 +586,7 @@ if (GGML_RPC)
|
||||
list(APPEND GGML_CDEF_PUBLIC GGML_USE_RPC)
|
||||
|
||||
if (WIN32)
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ws2_32)
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE ws2_32)
|
||||
endif()
|
||||
|
||||
set(GGML_HEADERS_RPC ../include/ggml-rpc.h)
|
||||
@ -657,8 +664,8 @@ if (GGML_VULKAN)
|
||||
set(GGML_HEADERS_VULKAN ${CMAKE_CURRENT_SOURCE_DIR}/../include/ggml-vulkan.h ${_ggml_vk_header})
|
||||
set(GGML_SOURCES_VULKAN ggml-vulkan.cpp ${_ggml_vk_source})
|
||||
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} Vulkan::Vulkan)
|
||||
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CMAKE_CURRENT_BINARY_DIR})
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE Vulkan::Vulkan)
|
||||
list(APPEND GGML_EXTRA_INCLUDES ${CMAKE_CURRENT_BINARY_DIR})
|
||||
else()
|
||||
message(WARNING "Vulkan not found")
|
||||
endif()
|
||||
@ -817,8 +824,8 @@ if (GGML_KOMPUTE)
|
||||
|
||||
list(APPEND GGML_CDEF_PUBLIC GGML_USE_KOMPUTE)
|
||||
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} kompute)
|
||||
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CMAKE_CURRENT_BINARY_DIR})
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE kompute)
|
||||
list(APPEND GGML_EXTRA_INCLUDES ${CMAKE_CURRENT_BINARY_DIR})
|
||||
else()
|
||||
message(WARNING "Kompute not found")
|
||||
endif()
|
||||
@ -883,9 +890,10 @@ if (GGML_CANN)
|
||||
message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
|
||||
message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
|
||||
|
||||
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${CANN_LIBRARIES} )
|
||||
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS})
|
||||
set(GGML_EXTRA_LIBDIRS ${GGML_EXTRA_LIBDIRS} ${CANN_INSTALL_DIR}/lib64)
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE ${CANN_LIBRARIES} )
|
||||
list(APPEND GGML_EXTRA_INCLUDES ${CANN_INCLUDE_DIRS})
|
||||
list(APPEND GGML_EXTRA_LIBDIRS ${CANN_INSTALL_DIR}/lib64)
|
||||
|
||||
list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN)
|
||||
endif()
|
||||
else()
|
||||
@ -1328,15 +1336,19 @@ target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES})
|
||||
target_link_directories (ggml PRIVATE ${GGML_EXTRA_LIBDIRS})
|
||||
target_compile_features (ggml PRIVATE c_std_11) # don't bump
|
||||
|
||||
target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS})
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE Threads::Threads)
|
||||
|
||||
find_library(MATH_LIBRARY m)
|
||||
if (MATH_LIBRARY)
|
||||
if (NOT WIN32 OR NOT GGML_SYCL)
|
||||
target_link_libraries(ggml PRIVATE ${MATH_LIBRARY})
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE m)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PRIVATE)
|
||||
list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PUBLIC)
|
||||
target_link_libraries(ggml PRIVATE ${GGML_EXTRA_LIBS_PRIVATE} PUBLIC ${GGML_EXTRA_LIBS_PUBLIC})
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_compile_definitions(ggml PRIVATE GGML_SHARED GGML_BUILD)
|
||||
|
@ -4,6 +4,7 @@
|
||||
|
||||
#include "ggml-quants.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
|
@ -1,3 +1,4 @@
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-blas.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
|
@ -30,6 +30,7 @@
|
||||
#include <cstring>
|
||||
#include <mutex>
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-cann/aclnn_ops.h"
|
||||
#include "ggml-cann/common.h"
|
||||
@ -1220,6 +1221,116 @@ ggml_backend_cann_buffer_type(int32_t device) {
|
||||
return &ggml_backend_cann_buffer_types[device];
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Retrieves the name associated with a CANN host buffer type.
|
||||
*
|
||||
* This function returns the descriptive name associated with the specified
|
||||
* CANN host buffer type context.
|
||||
*
|
||||
* @param buft Pointer to the host buffer type context.
|
||||
* @return Const pointer to the C-style string containing the name.
|
||||
*/
|
||||
GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
return "CANN_Host";
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Retrieves the name associated with a CANN host buffer.
|
||||
*
|
||||
* This function returns the descriptive name associated with the specified
|
||||
* CANN host buffer context.
|
||||
*
|
||||
* @param buft Pointer to the host buffer context.
|
||||
* @return Const pointer to the C-style string containing the name.
|
||||
*/
|
||||
GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
|
||||
return "CANN_Host";
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Free resources associated with a CANN host buffer.
|
||||
*
|
||||
* This function frees the resources associated with a CANN host buffer, including
|
||||
* its context.
|
||||
*
|
||||
* @param buffer The CANN host buffer to free.
|
||||
*/
|
||||
GGML_CALL static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
|
||||
ACL_CHECK(aclrtFreeHost(buffer->context));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Allocates a new CANN host buffer of the specified size.
|
||||
*
|
||||
* This function allocates a new CANN host buffer with the given size.
|
||||
* @param size Size in bytes of the host buffer to allocate.
|
||||
* @return Pointer to the allocated host buffer, or nullptr if allocation fails.
|
||||
*/
|
||||
static void * ggml_cann_host_malloc(size_t size) {
|
||||
if (getenv("GGML_CANN_NO_PINNED") != nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void * hostPtr = nullptr;
|
||||
aclError err = aclrtMallocHost((void **) &hostPtr, size);
|
||||
if (err != ACL_SUCCESS) {
|
||||
|
||||
GGML_CANN_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
|
||||
size / 1024.0 / 1024.0, aclGetRecentErrMsg());
|
||||
return nullptr;
|
||||
}
|
||||
return hostPtr;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Allocates a new CANN host buffer of the specified type and size.
|
||||
*
|
||||
* @param buft Pointer to the host buffer type context.
|
||||
* @param size Size in bytes of the host buffer to allocate.
|
||||
* @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
|
||||
*/
|
||||
GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
void * hostPtr = ggml_cann_host_malloc(size);
|
||||
|
||||
if (hostPtr == nullptr) {
|
||||
// fallback to cpu buffer
|
||||
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
|
||||
buffer->buft = buft;
|
||||
buffer->iface.get_name = ggml_backend_cann_host_buffer_name;
|
||||
buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Interface for managing CANN host buffer types in the GGML backend.
|
||||
*
|
||||
* Provides function pointers for allocating, querying properties, and managing
|
||||
* memory for CANN buffer types in the GGML backend.
|
||||
*/
|
||||
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
|
||||
static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_cann_host_buffer_type_name,
|
||||
/* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
||||
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
||||
},
|
||||
/* .context = */ nullptr,
|
||||
};
|
||||
|
||||
return &ggml_backend_cann_buffer_type_host;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Computes the forward operation for a given tensor using CANN
|
||||
* operations.
|
||||
@ -1942,7 +2053,7 @@ GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) {
|
||||
GGML_CANN_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ggml_cann_set_device(ctx->device);
|
||||
ggml_backend_t cann_backend =
|
||||
new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
|
||||
/* .interface = */ ggml_backend_cann_interface,
|
||||
|
614
ggml/src/ggml-cpu-impl.h
Normal file
614
ggml/src/ggml-cpu-impl.h
Normal file
@ -0,0 +1,614 @@
|
||||
#pragma once
|
||||
|
||||
// GGML CPU internal header
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
||||
//#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h> // memcpy
|
||||
#include <math.h> // fabsf
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
|
||||
#define m512bh(p) p
|
||||
#define m512i(p) p
|
||||
|
||||
#else
|
||||
|
||||
#define m512bh(p) (__m512bh)(p)
|
||||
#define m512i(p) (__m512i)(p)
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Converts brain16 to float32.
|
||||
*
|
||||
* The bfloat16 floating point format has the following structure:
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌──┴───┐┌─┴───┐
|
||||
* 0b0000000000000000 brain16
|
||||
*
|
||||
* Since bf16 has the same number of exponent bits as a 32bit float,
|
||||
* encoding and decoding numbers becomes relatively straightforward.
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌──┴───┐┌─┴───────────────────┐
|
||||
* 0b00000000000000000000000000000000 IEEE binary32
|
||||
*
|
||||
* For comparison, the standard fp16 format has fewer exponent bits.
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌─┴─┐┌─┴──────┐
|
||||
* 0b0000000000000000 IEEE binary16
|
||||
*
|
||||
* @see IEEE 754-2008
|
||||
*/
|
||||
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
|
||||
union {
|
||||
float f;
|
||||
uint32_t i;
|
||||
} u;
|
||||
u.i = (uint32_t)h.bits << 16;
|
||||
return u.f;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts float32 to brain16.
|
||||
*
|
||||
* This is binary identical with Google Brain float conversion.
|
||||
* Floats shall round to nearest even, and NANs shall be quiet.
|
||||
* Subnormals aren't flushed to zero, except perhaps when used.
|
||||
* This code should vectorize nicely if using modern compilers.
|
||||
*/
|
||||
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
|
||||
ggml_bf16_t h;
|
||||
union {
|
||||
float f;
|
||||
uint32_t i;
|
||||
} u;
|
||||
u.f = s;
|
||||
if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
|
||||
h.bits = (u.i >> 16) | 64; /* force to quiet */
|
||||
return h;
|
||||
}
|
||||
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
|
||||
return h;
|
||||
}
|
||||
|
||||
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
|
||||
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
|
||||
|
||||
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
||||
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
|
||||
#ifndef __FMA__
|
||||
#define __FMA__
|
||||
#endif
|
||||
#ifndef __F16C__
|
||||
#define __F16C__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
|
||||
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
|
||||
#ifndef __SSE3__
|
||||
#define __SSE3__
|
||||
#endif
|
||||
#ifndef __SSSE3__
|
||||
#define __SSSE3__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
#include <arm_sve.h>
|
||||
#include <sys/prctl.h>
|
||||
#endif
|
||||
|
||||
// 16-bit float
|
||||
// on Arm, we use __fp16
|
||||
// on x86, we use uint16_t
|
||||
#if defined(__ARM_NEON)
|
||||
|
||||
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
||||
//
|
||||
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
|
||||
//
|
||||
#include <arm_neon.h>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
|
||||
typedef uint16_t ggml_fp16_internal_t;
|
||||
|
||||
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
|
||||
|
||||
#else
|
||||
|
||||
typedef __fp16 ggml_fp16_internal_t;
|
||||
|
||||
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
|
||||
|
||||
#endif // _MSC_VER
|
||||
|
||||
#if !defined(__aarch64__)
|
||||
|
||||
// 32-bit ARM compatibility
|
||||
|
||||
// vaddlvq_s16
|
||||
// vpaddq_s16
|
||||
// vpaddq_s32
|
||||
// vaddvq_s32
|
||||
// vaddvq_f32
|
||||
// vmaxvq_f32
|
||||
// vcvtnq_s32_f32
|
||||
// vzip1_u8
|
||||
// vzip2_u8
|
||||
|
||||
inline static int32_t vaddlvq_s16(int16x8_t v) {
|
||||
int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
|
||||
return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
|
||||
}
|
||||
|
||||
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
|
||||
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
|
||||
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
|
||||
return vcombine_s16(a0, b0);
|
||||
}
|
||||
|
||||
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
|
||||
int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
|
||||
int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
|
||||
return vcombine_s32(a0, b0);
|
||||
}
|
||||
|
||||
inline static int32_t vaddvq_s32(int32x4_t v) {
|
||||
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
||||
}
|
||||
|
||||
inline static float vaddvq_f32(float32x4_t v) {
|
||||
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
|
||||
}
|
||||
|
||||
inline static float vmaxvq_f32(float32x4_t v) {
|
||||
return
|
||||
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
||||
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
|
||||
}
|
||||
|
||||
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
||||
int32x4_t res;
|
||||
|
||||
res[0] = roundf(vgetq_lane_f32(v, 0));
|
||||
res[1] = roundf(vgetq_lane_f32(v, 1));
|
||||
res[2] = roundf(vgetq_lane_f32(v, 2));
|
||||
res[3] = roundf(vgetq_lane_f32(v, 3));
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
|
||||
uint8x8_t res;
|
||||
|
||||
res[0] = a[0]; res[1] = b[0];
|
||||
res[2] = a[1]; res[3] = b[1];
|
||||
res[4] = a[2]; res[5] = b[2];
|
||||
res[6] = a[3]; res[7] = b[3];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
|
||||
uint8x8_t res;
|
||||
|
||||
res[0] = a[4]; res[1] = b[4];
|
||||
res[2] = a[5]; res[3] = b[5];
|
||||
res[4] = a[6]; res[5] = b[6];
|
||||
res[6] = a[7]; res[7] = b[7];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// vld1q_s16_x2
|
||||
// vld1q_u8_x2
|
||||
// vld1q_u8_x4
|
||||
// vld1q_s8_x2
|
||||
// vld1q_s8_x4
|
||||
// TODO: double-check these work correctly
|
||||
|
||||
typedef struct ggml_int16x8x2_t {
|
||||
int16x8_t val[2];
|
||||
} ggml_int16x8x2_t;
|
||||
|
||||
inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
|
||||
ggml_int16x8x2_t res;
|
||||
|
||||
res.val[0] = vld1q_s16(ptr + 0);
|
||||
res.val[1] = vld1q_s16(ptr + 8);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_uint8x16x2_t {
|
||||
uint8x16_t val[2];
|
||||
} ggml_uint8x16x2_t;
|
||||
|
||||
inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
|
||||
ggml_uint8x16x2_t res;
|
||||
|
||||
res.val[0] = vld1q_u8(ptr + 0);
|
||||
res.val[1] = vld1q_u8(ptr + 16);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_uint8x16x4_t {
|
||||
uint8x16_t val[4];
|
||||
} ggml_uint8x16x4_t;
|
||||
|
||||
inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
|
||||
ggml_uint8x16x4_t res;
|
||||
|
||||
res.val[0] = vld1q_u8(ptr + 0);
|
||||
res.val[1] = vld1q_u8(ptr + 16);
|
||||
res.val[2] = vld1q_u8(ptr + 32);
|
||||
res.val[3] = vld1q_u8(ptr + 48);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_int8x16x2_t {
|
||||
int8x16_t val[2];
|
||||
} ggml_int8x16x2_t;
|
||||
|
||||
inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
|
||||
ggml_int8x16x2_t res;
|
||||
|
||||
res.val[0] = vld1q_s8(ptr + 0);
|
||||
res.val[1] = vld1q_s8(ptr + 16);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_int8x16x4_t {
|
||||
int8x16_t val[4];
|
||||
} ggml_int8x16x4_t;
|
||||
|
||||
inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
|
||||
ggml_int8x16x4_t res;
|
||||
|
||||
res.val[0] = vld1q_s8(ptr + 0);
|
||||
res.val[1] = vld1q_s8(ptr + 16);
|
||||
res.val[2] = vld1q_s8(ptr + 32);
|
||||
res.val[3] = vld1q_s8(ptr + 48);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// NOTE: not tested
|
||||
inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
||||
int8x16_t res;
|
||||
|
||||
res[ 0] = a[b[ 0]];
|
||||
res[ 1] = a[b[ 1]];
|
||||
res[ 2] = a[b[ 2]];
|
||||
res[ 3] = a[b[ 3]];
|
||||
res[ 4] = a[b[ 4]];
|
||||
res[ 5] = a[b[ 5]];
|
||||
res[ 6] = a[b[ 6]];
|
||||
res[ 7] = a[b[ 7]];
|
||||
res[ 8] = a[b[ 8]];
|
||||
res[ 9] = a[b[ 9]];
|
||||
res[10] = a[b[10]];
|
||||
res[11] = a[b[11]];
|
||||
res[12] = a[b[12]];
|
||||
res[13] = a[b[13]];
|
||||
res[14] = a[b[14]];
|
||||
res[15] = a[b[15]];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// NOTE: not tested
|
||||
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
|
||||
uint8x16_t res;
|
||||
|
||||
res[ 0] = a[b[ 0]];
|
||||
res[ 1] = a[b[ 1]];
|
||||
res[ 2] = a[b[ 2]];
|
||||
res[ 3] = a[b[ 3]];
|
||||
res[ 4] = a[b[ 4]];
|
||||
res[ 5] = a[b[ 5]];
|
||||
res[ 6] = a[b[ 6]];
|
||||
res[ 7] = a[b[ 7]];
|
||||
res[ 8] = a[b[ 8]];
|
||||
res[ 9] = a[b[ 9]];
|
||||
res[10] = a[b[10]];
|
||||
res[11] = a[b[11]];
|
||||
res[12] = a[b[12]];
|
||||
res[13] = a[b[13]];
|
||||
res[14] = a[b[14]];
|
||||
res[15] = a[b[15]];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define ggml_int16x8x2_t int16x8x2_t
|
||||
#define ggml_uint8x16x2_t uint8x16x2_t
|
||||
#define ggml_uint8x16x4_t uint8x16x4_t
|
||||
#define ggml_int8x16x2_t int8x16x2_t
|
||||
#define ggml_int8x16x4_t int8x16x4_t
|
||||
|
||||
#define ggml_vld1q_s16_x2 vld1q_s16_x2
|
||||
#define ggml_vld1q_u8_x2 vld1q_u8_x2
|
||||
#define ggml_vld1q_u8_x4 vld1q_u8_x4
|
||||
#define ggml_vld1q_s8_x2 vld1q_s8_x2
|
||||
#define ggml_vld1q_s8_x4 vld1q_s8_x4
|
||||
#define ggml_vqtbl1q_s8 vqtbl1q_s8
|
||||
#define ggml_vqtbl1q_u8 vqtbl1q_u8
|
||||
|
||||
#endif // !defined(__aarch64__)
|
||||
|
||||
#if !defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
|
||||
const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
|
||||
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
|
||||
|
||||
return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
|
||||
|
||||
#endif // !defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
#endif // defined(__ARM_NEON)
|
||||
|
||||
#if defined(__ARM_NEON) && !defined(_MSC_VER)
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
|
||||
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
ggml_fp16_internal_t tmp;
|
||||
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
|
||||
return (float)tmp;
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
ggml_fp16_t res;
|
||||
ggml_fp16_internal_t tmp = f;
|
||||
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
|
||||
return res;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#ifdef __wasm_simd128__
|
||||
#include <wasm_simd128.h>
|
||||
#else
|
||||
#ifdef __POWER9_VECTOR__
|
||||
#include <altivec.h>
|
||||
#undef bool
|
||||
#define bool _Bool
|
||||
#else
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
|
||||
#if !defined(__riscv)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __riscv_v_intrinsic
|
||||
#include <riscv_vector.h>
|
||||
#endif
|
||||
|
||||
#if defined(__loongarch64)
|
||||
#if defined(__loongarch_asx)
|
||||
#include <lasxintrin.h>
|
||||
#endif
|
||||
#if defined(__loongarch_sx)
|
||||
#include <lsxintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__loongarch_asx)
|
||||
|
||||
typedef union {
|
||||
int32_t i;
|
||||
float f;
|
||||
} ft_union;
|
||||
|
||||
/* float type data load instructions */
|
||||
static __m128 __lsx_vreplfr2vr_s(float val) {
|
||||
ft_union fi_tmpval = {.f = val};
|
||||
return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
|
||||
}
|
||||
|
||||
static __m256 __lasx_xvreplfr2vr_s(float val) {
|
||||
ft_union fi_tmpval = {.f = val};
|
||||
return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __F16C__
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
|
||||
#else
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
||||
#endif
|
||||
|
||||
#elif defined(__POWER9_VECTOR__)
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
/* the inline asm below is about 12% faster than the lookup method */
|
||||
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
|
||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
register float f;
|
||||
register double d;
|
||||
__asm__(
|
||||
"mtfprd %0,%2\n"
|
||||
"xscvhpdp %0,%0\n"
|
||||
"frsp %1,%0\n" :
|
||||
/* temp */ "=d"(d),
|
||||
/* out */ "=f"(f):
|
||||
/* in */ "r"(h));
|
||||
return f;
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
register double d;
|
||||
register ggml_fp16_t r;
|
||||
__asm__( /* xscvdphp can work on double or single precision */
|
||||
"xscvdphp %0,%2\n"
|
||||
"mffprd %1,%0\n" :
|
||||
/* temp */ "=d"(d),
|
||||
/* out */ "=r"(r):
|
||||
/* in */ "f"(f));
|
||||
return r;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// FP16 <-> FP32
|
||||
// ref: https://github.com/Maratyszcza/FP16
|
||||
|
||||
static inline float fp32_from_bits(uint32_t w) {
|
||||
union {
|
||||
uint32_t as_bits;
|
||||
float as_value;
|
||||
} fp32;
|
||||
fp32.as_bits = w;
|
||||
return fp32.as_value;
|
||||
}
|
||||
|
||||
static inline uint32_t fp32_to_bits(float f) {
|
||||
union {
|
||||
float as_value;
|
||||
uint32_t as_bits;
|
||||
} fp32;
|
||||
fp32.as_value = f;
|
||||
return fp32.as_bits;
|
||||
}
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
const uint32_t w = (uint32_t) h << 16;
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
const uint32_t two_w = w + w;
|
||||
|
||||
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
|
||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
||||
const float exp_scale = 0x1.0p-112f;
|
||||
#else
|
||||
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
|
||||
#endif
|
||||
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
|
||||
|
||||
const uint32_t magic_mask = UINT32_C(126) << 23;
|
||||
const float magic_bias = 0.5f;
|
||||
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
|
||||
|
||||
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
|
||||
const uint32_t result = sign |
|
||||
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
|
||||
return fp32_from_bits(result);
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
||||
const float scale_to_inf = 0x1.0p+112f;
|
||||
const float scale_to_zero = 0x1.0p-110f;
|
||||
#else
|
||||
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
|
||||
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
|
||||
#endif
|
||||
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
|
||||
|
||||
const uint32_t w = fp32_to_bits(f);
|
||||
const uint32_t shl1_w = w + w;
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
|
||||
if (bias < UINT32_C(0x71000000)) {
|
||||
bias = UINT32_C(0x71000000);
|
||||
}
|
||||
|
||||
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
|
||||
const uint32_t bits = fp32_to_bits(base);
|
||||
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
|
||||
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
|
||||
const uint32_t nonsign = exp_bits + mantissa_bits;
|
||||
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
||||
}
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
|
||||
#endif // __F16C__
|
||||
|
||||
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
#include <arm_sve.h>
|
||||
#endif // __ARM_FEATURE_SVE
|
||||
|
||||
// precomputed f32 table for f16 (256 KB)
|
||||
// defined in ggml.c, initialized in ggml_init()
|
||||
extern float ggml_table_f32_f16[1 << 16];
|
||||
|
||||
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
|
||||
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
|
||||
// This is also true for POWER9.
|
||||
#if !defined(GGML_FP16_TO_FP32)
|
||||
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
||||
uint16_t s;
|
||||
memcpy(&s, &f, sizeof(uint16_t));
|
||||
return ggml_table_f32_f16[s];
|
||||
}
|
||||
|
||||
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
|
||||
#endif
|
||||
|
||||
#if !defined(GGML_FP32_TO_FP16)
|
||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
@ -1,5 +1,5 @@
|
||||
#include "ggml-cuda.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include "ggml-cuda/common.cuh"
|
||||
@ -2552,7 +2552,11 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
|
||||
if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
|
||||
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
|
||||
use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
|
||||
#ifndef NDEBUG
|
||||
GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
|
||||
|
@ -26,7 +26,11 @@ void ggml_cuda_op_mul_mat_q(
|
||||
// nrows_dst == nrows of the matrix that the kernel writes into
|
||||
const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;
|
||||
|
||||
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst};
|
||||
// The stream-k decomposition is only faster for recent NVIDIA GPUs.
|
||||
// Also its fixup needs to allocate a temporary buffer in the memory pool.
|
||||
// There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
|
||||
const bool use_stream_k = compute_capability >= CC_VOLTA && compute_capability < CC_OFFSET_AMD && src1_ncols == ne11;
|
||||
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
|
@ -2742,6 +2742,7 @@ struct mmq_args {
|
||||
int64_t ne00; int64_t ne01; int64_t stride01;
|
||||
int64_t ne10; int64_t ne11; int64_t stride11;
|
||||
int64_t ne0;
|
||||
bool use_stream_k;
|
||||
};
|
||||
|
||||
template<ggml_type type>
|
||||
@ -2777,8 +2778,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
|
||||
const dim3 block_nums_xy_tiling(nty, ntx, 1);
|
||||
|
||||
const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD;
|
||||
if (!use_stream_k) {
|
||||
if (!args.use_stream_k) {
|
||||
if (args.ne01 % mmq_y == 0) {
|
||||
constexpr bool need_check = false;
|
||||
mul_mat_q<type, mmq_x, MMQ_NWARPS, need_check><<<block_nums_xy_tiling, block_dims, shmem, stream>>>
|
||||
|
@ -1,13 +1,15 @@
|
||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
|
||||
// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh.
|
||||
// For this reason CUB must be included BEFORE anything else.
|
||||
#include <cub/cub.cuh>
|
||||
using namespace cub;
|
||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
|
||||
|
||||
#include "sumrows.cuh"
|
||||
#include "sum.cuh"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
|
||||
#include <cub/cub.cuh>
|
||||
using namespace cub;
|
||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
|
||||
|
||||
void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
|
||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
|
||||
size_t tmp_size = 0;
|
||||
|
39
ggml/src/ggml-cuda/vendors/musa.h
vendored
39
ggml/src/ggml-cuda/vendors/musa.h
vendored
@ -130,42 +130,3 @@
|
||||
#define cudaKernelNodeParams musaKernelNodeParams
|
||||
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
|
||||
#define cudaStreamEndCapture musaStreamEndCapture
|
||||
|
||||
// XXX: Clang builtins mapping
|
||||
#define __vsub4 __vsub4_musa
|
||||
#define __vcmpeq4 __vcmpeq4_musa
|
||||
#define __vcmpne4 __vcmpne4_musa
|
||||
|
||||
#ifndef __has_builtin
|
||||
#define __has_builtin(x) 0
|
||||
#endif
|
||||
|
||||
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
|
||||
|
||||
static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
|
||||
return __vsubss4(a, b);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
|
||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||
unsigned int c;
|
||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
|
||||
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
|
||||
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
|
||||
unsigned int c;
|
||||
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
@ -1,15 +1,17 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
// GGML internal header
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
||||
#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h> // memcpy
|
||||
#include <math.h> // fabsf
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#undef MIN
|
||||
#undef MAX
|
||||
@ -17,96 +19,6 @@
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
|
||||
#define m512bh(p) p
|
||||
#define m512i(p) p
|
||||
|
||||
#else
|
||||
|
||||
#define m512bh(p) (__m512bh)(p)
|
||||
#define m512i(p) (__m512i)(p)
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Converts brain16 to float32.
|
||||
*
|
||||
* The bfloat16 floating point format has the following structure:
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌──┴───┐┌─┴───┐
|
||||
* 0b0000000000000000 brain16
|
||||
*
|
||||
* Since bf16 has the same number of exponent bits as a 32bit float,
|
||||
* encoding and decoding numbers becomes relatively straightforward.
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌──┴───┐┌─┴───────────────────┐
|
||||
* 0b00000000000000000000000000000000 IEEE binary32
|
||||
*
|
||||
* For comparison, the standard fp16 format has fewer exponent bits.
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌─┴─┐┌─┴──────┐
|
||||
* 0b0000000000000000 IEEE binary16
|
||||
*
|
||||
* @see IEEE 754-2008
|
||||
*/
|
||||
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
|
||||
union {
|
||||
float f;
|
||||
uint32_t i;
|
||||
} u;
|
||||
u.i = (uint32_t)h.bits << 16;
|
||||
return u.f;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts float32 to brain16.
|
||||
*
|
||||
* This is binary identical with Google Brain float conversion.
|
||||
* Floats shall round to nearest even, and NANs shall be quiet.
|
||||
* Subnormals aren't flushed to zero, except perhaps when used.
|
||||
* This code should vectorize nicely if using modern compilers.
|
||||
*/
|
||||
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
|
||||
ggml_bf16_t h;
|
||||
union {
|
||||
float f;
|
||||
uint32_t i;
|
||||
} u;
|
||||
u.f = s;
|
||||
if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
|
||||
h.bits = (u.i >> 16) | 64; /* force to quiet */
|
||||
return h;
|
||||
}
|
||||
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
|
||||
return h;
|
||||
}
|
||||
|
||||
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
|
||||
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// static_assert should be a #define, but if it's not,
|
||||
// fall back to the _Static_assert C11 keyword.
|
||||
// if C99 - static_assert is noop
|
||||
@ -121,516 +33,10 @@ extern "C" {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
||||
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
|
||||
#ifndef __FMA__
|
||||
#define __FMA__
|
||||
#endif
|
||||
#ifndef __F16C__
|
||||
#define __F16C__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
|
||||
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
|
||||
#ifndef __SSE3__
|
||||
#define __SSE3__
|
||||
#endif
|
||||
#ifndef __SSSE3__
|
||||
#define __SSSE3__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
#include <arm_sve.h>
|
||||
#include <sys/prctl.h>
|
||||
#endif
|
||||
|
||||
// 16-bit float
|
||||
// on Arm, we use __fp16
|
||||
// on x86, we use uint16_t
|
||||
#if defined(__ARM_NEON)
|
||||
|
||||
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
||||
//
|
||||
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
|
||||
//
|
||||
#include <arm_neon.h>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
|
||||
typedef uint16_t ggml_fp16_internal_t;
|
||||
|
||||
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
|
||||
|
||||
#else
|
||||
|
||||
typedef __fp16 ggml_fp16_internal_t;
|
||||
|
||||
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
|
||||
|
||||
#endif // _MSC_VER
|
||||
|
||||
#if !defined(__aarch64__)
|
||||
|
||||
// 32-bit ARM compatibility
|
||||
|
||||
// vaddlvq_s16
|
||||
// vpaddq_s16
|
||||
// vpaddq_s32
|
||||
// vaddvq_s32
|
||||
// vaddvq_f32
|
||||
// vmaxvq_f32
|
||||
// vcvtnq_s32_f32
|
||||
// vzip1_u8
|
||||
// vzip2_u8
|
||||
|
||||
inline static int32_t vaddlvq_s16(int16x8_t v) {
|
||||
int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
|
||||
return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
|
||||
}
|
||||
|
||||
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
|
||||
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
|
||||
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
|
||||
return vcombine_s16(a0, b0);
|
||||
}
|
||||
|
||||
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
|
||||
int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
|
||||
int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
|
||||
return vcombine_s32(a0, b0);
|
||||
}
|
||||
|
||||
inline static int32_t vaddvq_s32(int32x4_t v) {
|
||||
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
||||
}
|
||||
|
||||
inline static float vaddvq_f32(float32x4_t v) {
|
||||
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
|
||||
}
|
||||
|
||||
inline static float vmaxvq_f32(float32x4_t v) {
|
||||
return
|
||||
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
||||
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
|
||||
}
|
||||
|
||||
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
||||
int32x4_t res;
|
||||
|
||||
res[0] = roundf(vgetq_lane_f32(v, 0));
|
||||
res[1] = roundf(vgetq_lane_f32(v, 1));
|
||||
res[2] = roundf(vgetq_lane_f32(v, 2));
|
||||
res[3] = roundf(vgetq_lane_f32(v, 3));
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
|
||||
uint8x8_t res;
|
||||
|
||||
res[0] = a[0]; res[1] = b[0];
|
||||
res[2] = a[1]; res[3] = b[1];
|
||||
res[4] = a[2]; res[5] = b[2];
|
||||
res[6] = a[3]; res[7] = b[3];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
|
||||
uint8x8_t res;
|
||||
|
||||
res[0] = a[4]; res[1] = b[4];
|
||||
res[2] = a[5]; res[3] = b[5];
|
||||
res[4] = a[6]; res[5] = b[6];
|
||||
res[6] = a[7]; res[7] = b[7];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// vld1q_s16_x2
|
||||
// vld1q_u8_x2
|
||||
// vld1q_u8_x4
|
||||
// vld1q_s8_x2
|
||||
// vld1q_s8_x4
|
||||
// TODO: double-check these work correctly
|
||||
|
||||
typedef struct ggml_int16x8x2_t {
|
||||
int16x8_t val[2];
|
||||
} ggml_int16x8x2_t;
|
||||
|
||||
inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
|
||||
ggml_int16x8x2_t res;
|
||||
|
||||
res.val[0] = vld1q_s16(ptr + 0);
|
||||
res.val[1] = vld1q_s16(ptr + 8);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_uint8x16x2_t {
|
||||
uint8x16_t val[2];
|
||||
} ggml_uint8x16x2_t;
|
||||
|
||||
inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
|
||||
ggml_uint8x16x2_t res;
|
||||
|
||||
res.val[0] = vld1q_u8(ptr + 0);
|
||||
res.val[1] = vld1q_u8(ptr + 16);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_uint8x16x4_t {
|
||||
uint8x16_t val[4];
|
||||
} ggml_uint8x16x4_t;
|
||||
|
||||
inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
|
||||
ggml_uint8x16x4_t res;
|
||||
|
||||
res.val[0] = vld1q_u8(ptr + 0);
|
||||
res.val[1] = vld1q_u8(ptr + 16);
|
||||
res.val[2] = vld1q_u8(ptr + 32);
|
||||
res.val[3] = vld1q_u8(ptr + 48);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_int8x16x2_t {
|
||||
int8x16_t val[2];
|
||||
} ggml_int8x16x2_t;
|
||||
|
||||
inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
|
||||
ggml_int8x16x2_t res;
|
||||
|
||||
res.val[0] = vld1q_s8(ptr + 0);
|
||||
res.val[1] = vld1q_s8(ptr + 16);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
typedef struct ggml_int8x16x4_t {
|
||||
int8x16_t val[4];
|
||||
} ggml_int8x16x4_t;
|
||||
|
||||
inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
|
||||
ggml_int8x16x4_t res;
|
||||
|
||||
res.val[0] = vld1q_s8(ptr + 0);
|
||||
res.val[1] = vld1q_s8(ptr + 16);
|
||||
res.val[2] = vld1q_s8(ptr + 32);
|
||||
res.val[3] = vld1q_s8(ptr + 48);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// NOTE: not tested
|
||||
inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
||||
int8x16_t res;
|
||||
|
||||
res[ 0] = a[b[ 0]];
|
||||
res[ 1] = a[b[ 1]];
|
||||
res[ 2] = a[b[ 2]];
|
||||
res[ 3] = a[b[ 3]];
|
||||
res[ 4] = a[b[ 4]];
|
||||
res[ 5] = a[b[ 5]];
|
||||
res[ 6] = a[b[ 6]];
|
||||
res[ 7] = a[b[ 7]];
|
||||
res[ 8] = a[b[ 8]];
|
||||
res[ 9] = a[b[ 9]];
|
||||
res[10] = a[b[10]];
|
||||
res[11] = a[b[11]];
|
||||
res[12] = a[b[12]];
|
||||
res[13] = a[b[13]];
|
||||
res[14] = a[b[14]];
|
||||
res[15] = a[b[15]];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// NOTE: not tested
|
||||
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
|
||||
uint8x16_t res;
|
||||
|
||||
res[ 0] = a[b[ 0]];
|
||||
res[ 1] = a[b[ 1]];
|
||||
res[ 2] = a[b[ 2]];
|
||||
res[ 3] = a[b[ 3]];
|
||||
res[ 4] = a[b[ 4]];
|
||||
res[ 5] = a[b[ 5]];
|
||||
res[ 6] = a[b[ 6]];
|
||||
res[ 7] = a[b[ 7]];
|
||||
res[ 8] = a[b[ 8]];
|
||||
res[ 9] = a[b[ 9]];
|
||||
res[10] = a[b[10]];
|
||||
res[11] = a[b[11]];
|
||||
res[12] = a[b[12]];
|
||||
res[13] = a[b[13]];
|
||||
res[14] = a[b[14]];
|
||||
res[15] = a[b[15]];
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define ggml_int16x8x2_t int16x8x2_t
|
||||
#define ggml_uint8x16x2_t uint8x16x2_t
|
||||
#define ggml_uint8x16x4_t uint8x16x4_t
|
||||
#define ggml_int8x16x2_t int8x16x2_t
|
||||
#define ggml_int8x16x4_t int8x16x4_t
|
||||
|
||||
#define ggml_vld1q_s16_x2 vld1q_s16_x2
|
||||
#define ggml_vld1q_u8_x2 vld1q_u8_x2
|
||||
#define ggml_vld1q_u8_x4 vld1q_u8_x4
|
||||
#define ggml_vld1q_s8_x2 vld1q_s8_x2
|
||||
#define ggml_vld1q_s8_x4 vld1q_s8_x4
|
||||
#define ggml_vqtbl1q_s8 vqtbl1q_s8
|
||||
#define ggml_vqtbl1q_u8 vqtbl1q_u8
|
||||
|
||||
#endif // !defined(__aarch64__)
|
||||
|
||||
#if !defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
|
||||
const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
|
||||
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
|
||||
|
||||
return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
|
||||
|
||||
#endif // !defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
#endif // defined(__ARM_NEON)
|
||||
|
||||
#if defined(__ARM_NEON) && !defined(_MSC_VER)
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
|
||||
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
ggml_fp16_internal_t tmp;
|
||||
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
|
||||
return (float)tmp;
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
ggml_fp16_t res;
|
||||
ggml_fp16_internal_t tmp = f;
|
||||
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
|
||||
return res;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#ifdef __wasm_simd128__
|
||||
#include <wasm_simd128.h>
|
||||
#else
|
||||
#ifdef __POWER9_VECTOR__
|
||||
#include <altivec.h>
|
||||
#undef bool
|
||||
#define bool _Bool
|
||||
#else
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
|
||||
#if !defined(__riscv)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __riscv_v_intrinsic
|
||||
#include <riscv_vector.h>
|
||||
#endif
|
||||
|
||||
#if defined(__loongarch64)
|
||||
#if defined(__loongarch_asx)
|
||||
#include <lasxintrin.h>
|
||||
#endif
|
||||
#if defined(__loongarch_sx)
|
||||
#include <lsxintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__loongarch_asx)
|
||||
|
||||
typedef union {
|
||||
int32_t i;
|
||||
float f;
|
||||
} ft_union;
|
||||
|
||||
/* float type data load instructions */
|
||||
static __m128 __lsx_vreplfr2vr_s(float val) {
|
||||
ft_union fi_tmpval = {.f = val};
|
||||
return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
|
||||
}
|
||||
|
||||
static __m256 __lasx_xvreplfr2vr_s(float val) {
|
||||
ft_union fi_tmpval = {.f = val};
|
||||
return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __F16C__
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
|
||||
#else
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
||||
#endif
|
||||
|
||||
#elif defined(__POWER9_VECTOR__)
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
/* the inline asm below is about 12% faster than the lookup method */
|
||||
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
|
||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
register float f;
|
||||
register double d;
|
||||
__asm__(
|
||||
"mtfprd %0,%2\n"
|
||||
"xscvhpdp %0,%0\n"
|
||||
"frsp %1,%0\n" :
|
||||
/* temp */ "=d"(d),
|
||||
/* out */ "=f"(f):
|
||||
/* in */ "r"(h));
|
||||
return f;
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
register double d;
|
||||
register ggml_fp16_t r;
|
||||
__asm__( /* xscvdphp can work on double or single precision */
|
||||
"xscvdphp %0,%2\n"
|
||||
"mffprd %1,%0\n" :
|
||||
/* temp */ "=d"(d),
|
||||
/* out */ "=r"(r):
|
||||
/* in */ "f"(f));
|
||||
return r;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// FP16 <-> FP32
|
||||
// ref: https://github.com/Maratyszcza/FP16
|
||||
|
||||
static inline float fp32_from_bits(uint32_t w) {
|
||||
union {
|
||||
uint32_t as_bits;
|
||||
float as_value;
|
||||
} fp32;
|
||||
fp32.as_bits = w;
|
||||
return fp32.as_value;
|
||||
}
|
||||
|
||||
static inline uint32_t fp32_to_bits(float f) {
|
||||
union {
|
||||
float as_value;
|
||||
uint32_t as_bits;
|
||||
} fp32;
|
||||
fp32.as_value = f;
|
||||
return fp32.as_bits;
|
||||
}
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
const uint32_t w = (uint32_t) h << 16;
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
const uint32_t two_w = w + w;
|
||||
|
||||
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
|
||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
||||
const float exp_scale = 0x1.0p-112f;
|
||||
#else
|
||||
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
|
||||
#endif
|
||||
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
|
||||
|
||||
const uint32_t magic_mask = UINT32_C(126) << 23;
|
||||
const float magic_bias = 0.5f;
|
||||
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
|
||||
|
||||
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
|
||||
const uint32_t result = sign |
|
||||
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
|
||||
return fp32_from_bits(result);
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
||||
const float scale_to_inf = 0x1.0p+112f;
|
||||
const float scale_to_zero = 0x1.0p-110f;
|
||||
#else
|
||||
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
|
||||
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
|
||||
#endif
|
||||
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
|
||||
|
||||
const uint32_t w = fp32_to_bits(f);
|
||||
const uint32_t shl1_w = w + w;
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
|
||||
if (bias < UINT32_C(0x71000000)) {
|
||||
bias = UINT32_C(0x71000000);
|
||||
}
|
||||
|
||||
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
|
||||
const uint32_t bits = fp32_to_bits(base);
|
||||
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
|
||||
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
|
||||
const uint32_t nonsign = exp_bits + mantissa_bits;
|
||||
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
||||
}
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
|
||||
#endif // __F16C__
|
||||
|
||||
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
#include <arm_sve.h>
|
||||
#endif // __ARM_FEATURE_SVE
|
||||
|
||||
// precomputed f32 table for f16 (256 KB)
|
||||
// defined in ggml.c, initialized in ggml_init()
|
||||
extern float ggml_table_f32_f16[1 << 16];
|
||||
|
||||
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
|
||||
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
|
||||
// This is also true for POWER9.
|
||||
#if !defined(GGML_FP16_TO_FP32)
|
||||
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
||||
uint16_t s;
|
||||
memcpy(&s, &f, sizeof(uint16_t));
|
||||
return ggml_table_f32_f16[s];
|
||||
}
|
||||
|
||||
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
|
||||
#endif
|
||||
|
||||
#if !defined(GGML_FP32_TO_FP16)
|
||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||
#endif
|
||||
|
||||
// bitset
|
||||
|
||||
typedef uint32_t ggml_bitset_t;
|
||||
|
||||
static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
|
||||
#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
|
||||
#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)
|
||||
@ -656,6 +62,12 @@ static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
|
||||
#define GGML_HASHSET_FULL ((size_t)-1)
|
||||
#define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
|
||||
|
||||
struct ggml_hash_set {
|
||||
size_t size;
|
||||
ggml_bitset_t * used; // whether or not the keys are in use i.e. set
|
||||
struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
|
||||
};
|
||||
|
||||
struct ggml_hash_set ggml_hash_set_new(size_t size);
|
||||
void ggml_hash_set_free(struct ggml_hash_set * hash_set);
|
||||
|
||||
@ -745,6 +157,30 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
// computation graph
|
||||
|
||||
enum ggml_cgraph_eval_order {
|
||||
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
|
||||
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
|
||||
GGML_CGRAPH_EVAL_ORDER_COUNT
|
||||
};
|
||||
|
||||
struct ggml_cgraph {
|
||||
int size;
|
||||
int n_nodes;
|
||||
int n_leafs;
|
||||
|
||||
struct ggml_tensor ** nodes;
|
||||
struct ggml_tensor ** grads;
|
||||
struct ggml_tensor ** leafs;
|
||||
|
||||
struct ggml_hash_set visited_hash_set;
|
||||
|
||||
enum ggml_cgraph_eval_order order;
|
||||
};
|
||||
|
||||
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@ -1,4 +1,4 @@
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-kompute.h"
|
||||
|
@ -1,7 +1,7 @@
|
||||
#import "ggml-metal.h"
|
||||
|
||||
#import "ggml-impl.h"
|
||||
#import "ggml-backend-impl.h"
|
||||
#import "ggml.h"
|
||||
|
||||
#import <Foundation/Foundation.h>
|
||||
|
||||
@ -13,13 +13,16 @@
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
#ifdef GGML_METAL_NDEBUG
|
||||
#define GGML_METAL_LOG(...)
|
||||
#define GGML_METAL_LOG_INFO(...)
|
||||
#define GGML_METAL_LOG_WARN(...)
|
||||
#define GGML_METAL_LOG_ERROR(...)
|
||||
#else
|
||||
#define GGML_METAL_LOG(...) ggml_metal_log(GGML_LOG_LEVEL_NONE, __VA_ARGS__)
|
||||
#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
|
||||
#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
|
||||
#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||
#define GGML_METAL_LOG_DEBUG(...) ggml_metal_log(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
|
||||
#endif
|
||||
|
||||
#define UNUSED(x) (void)(x)
|
||||
@ -3039,8 +3042,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
if (status != MTLCommandBufferStatusCompleted) {
|
||||
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
||||
if (status == MTLCommandBufferStatusError) {
|
||||
NSString * error_code = [command_buffer error].localizedDescription;
|
||||
GGML_METAL_LOG_INFO("error: %s\n", [error_code UTF8String]);
|
||||
GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
|
||||
}
|
||||
|
||||
return GGML_STATUS_FAILED;
|
||||
@ -3184,7 +3186,7 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
|
||||
#ifndef GGML_METAL_NDEBUG
|
||||
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
|
||||
if (@available(macOS 10.12, iOS 16.0, *)) {
|
||||
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)",
|
||||
GGML_METAL_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n",
|
||||
__func__,
|
||||
size_aligned / 1024.0 / 1024.0,
|
||||
device.currentAllocatedSize / 1024.0 / 1024.0,
|
||||
@ -3192,8 +3194,6 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
|
||||
|
||||
if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
|
||||
GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
|
||||
} else {
|
||||
GGML_METAL_LOG_INFO("\n");
|
||||
}
|
||||
} else {
|
||||
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
|
||||
@ -3227,13 +3227,17 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff
|
||||
if (ctx->all_data != NULL) {
|
||||
ctx->buffers[0].data = ctx->all_data;
|
||||
ctx->buffers[0].size = size;
|
||||
ctx->buffers[0].metal = nil;
|
||||
|
||||
if (size_aligned > 0) {
|
||||
ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
|
||||
length:size_aligned
|
||||
options:MTLResourceStorageModeShared
|
||||
deallocator:nil];
|
||||
}
|
||||
}
|
||||
|
||||
if (ctx->all_data == NULL || ctx->buffers[0].metal == nil) {
|
||||
if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
|
||||
free(ctx);
|
||||
ggml_backend_metal_free_device();
|
||||
@ -3312,13 +3316,16 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data,
|
||||
if (size_aligned <= device.maxBufferLength) {
|
||||
ctx->buffers[ctx->n_buffers].data = data;
|
||||
ctx->buffers[ctx->n_buffers].size = size;
|
||||
ctx->buffers[ctx->n_buffers].metal = nil;
|
||||
|
||||
if (size_aligned > 0) {
|
||||
ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
||||
|
||||
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
ggml_backend_metal_log_allocated_size(device, size_aligned);
|
||||
|
||||
@ -3335,13 +3342,16 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data,
|
||||
|
||||
ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
|
||||
ctx->buffers[ctx->n_buffers].size = size_step_aligned;
|
||||
ctx->buffers[ctx->n_buffers].metal = nil;
|
||||
|
||||
if (size_step_aligned > 0) {
|
||||
ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
||||
|
||||
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
ggml_backend_metal_log_allocated_size(device, size_step_aligned);
|
||||
|
||||
|
@ -3,6 +3,7 @@
|
||||
|
||||
#include "ggml-quants.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
|
||||
|
||||
#include <math.h>
|
||||
@ -230,6 +231,12 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
|
||||
|
||||
return _mm_packus_epi16( bytes1, bytes2);
|
||||
}
|
||||
|
||||
static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
|
||||
const __m128i ax = _mm_sign_epi8(x, x);
|
||||
const __m128i sy = _mm_sign_epi8(y, x);
|
||||
return _mm_maddubs_epi16(ax, sy);
|
||||
}
|
||||
#endif
|
||||
#elif defined(__SSSE3__)
|
||||
// horizontally add 4x4 floats
|
||||
@ -4003,13 +4010,18 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
float sumf = 0;
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
if (ggml_sve_cnt_b == QK8_0) {
|
||||
const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
|
||||
const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
|
||||
|
||||
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
||||
|
||||
const int vector_length = ggml_sve_cnt_b*8;
|
||||
|
||||
// VLA Implementation using switch case
|
||||
switch (vector_length) {
|
||||
case 128:
|
||||
{
|
||||
// predicate for activating higher lanes for 4 float32 elements
|
||||
const svbool_t ph4 = svptrue_pat_b32(SV_VL4);
|
||||
|
||||
for (; ib + 1 < nb; ib += 2) {
|
||||
const block_q4_0 * restrict x0 = &x[ib + 0];
|
||||
const block_q4_0 * restrict x1 = &x[ib + 1];
|
||||
@ -4021,8 +4033,54 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
|
||||
|
||||
// 4-bit -> 8-bit
|
||||
const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04));
|
||||
const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04));
|
||||
const svint8_t qx0l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx0r, 0x0F));
|
||||
const svint8_t qx0h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx0r, 0x04));
|
||||
const svint8_t qx1l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx1r, 0x0F));
|
||||
const svint8_t qx1h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx1r, 0x04));
|
||||
|
||||
// sub 8
|
||||
const svint8_t qx0ls = svsub_n_s8_x(svptrue_b8(), qx0h, 8);
|
||||
const svint8_t qx0hs = svsub_n_s8_x(svptrue_b8(), qx0l, 8);
|
||||
const svint8_t qx1ls = svsub_n_s8_x(svptrue_b8(), qx1h, 8);
|
||||
const svint8_t qx1hs = svsub_n_s8_x(svptrue_b8(), qx1l, 8);
|
||||
|
||||
// load y
|
||||
const svint8_t qy0h = svld1_s8(svptrue_b8(), y0->qs);
|
||||
const svint8_t qy0l = svld1_s8(svptrue_b8(), y0->qs + 16);
|
||||
const svint8_t qy1h = svld1_s8(svptrue_b8(), y1->qs);
|
||||
const svint8_t qy1l = svld1_s8(svptrue_b8(), y1->qs + 16);
|
||||
|
||||
// dot product
|
||||
sumv0 = svmla_n_f32_x(ph4, sumv0, svcvt_f32_s32_x(ph4, svadd_x(ph4,
|
||||
svdot_s32(svdup_n_s32(0), qx0ls, qy0l),
|
||||
svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = svmla_n_f32_x(ph4, sumv1, svcvt_f32_s32_x(ph4, svadd_x(ph4,
|
||||
svdot_s32(svdup_n_s32(0), qx1ls, qy1l),
|
||||
svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
}
|
||||
|
||||
sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
||||
} break;
|
||||
case 256:
|
||||
{
|
||||
// predicate for activating higher lanes for 16 int8 elements
|
||||
const svbool_t ph16 = svptrue_pat_b8(SV_VL16);
|
||||
// predicate for activating lower lanes for 16 int8 elements
|
||||
const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16);
|
||||
|
||||
for (; ib + 1 < nb; ib += 2) {
|
||||
const block_q4_0 * restrict x0 = &x[ib + 0];
|
||||
const block_q4_0 * restrict x1 = &x[ib + 1];
|
||||
const block_q8_0 * restrict y0 = &y[ib + 0];
|
||||
const block_q8_0 * restrict y1 = &y[ib + 1];
|
||||
|
||||
// load x
|
||||
const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
|
||||
const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
|
||||
|
||||
// 4-bit -> 8-bit
|
||||
const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04));
|
||||
const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04));
|
||||
|
||||
// sub 8
|
||||
const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8);
|
||||
@ -4033,12 +4091,60 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
|
||||
|
||||
// dot product
|
||||
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
|
||||
svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
|
||||
svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
}
|
||||
|
||||
sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
||||
} break;
|
||||
case 512:
|
||||
{
|
||||
// predicate for activating higher lanes for 32 int8 elements
|
||||
const svbool_t ph32 = svptrue_pat_b8(SV_VL32);
|
||||
|
||||
// predicate for activating higher lanes for 16 int8 elements
|
||||
const svbool_t ph16 = svptrue_pat_b8(SV_VL16);
|
||||
// predicate for activating lower lanes for 16 int8 elements from first 32 int8 activated lanes
|
||||
const svbool_t pl16 = svnot_b_z(ph32, ph16);
|
||||
|
||||
for (; ib + 1 < nb; ib += 2) {
|
||||
const block_q4_0 * restrict x0 = &x[ib + 0];
|
||||
const block_q4_0 * restrict x1 = &x[ib + 1];
|
||||
const block_q8_0 * restrict y0 = &y[ib + 0];
|
||||
const block_q8_0 * restrict y1 = &y[ib + 1];
|
||||
|
||||
// load x
|
||||
const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs);
|
||||
const svuint8_t qx1r = svld1rq_u8(ph32, x1->qs);
|
||||
|
||||
// 4-bit -> 8-bit
|
||||
const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04));
|
||||
const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04));
|
||||
|
||||
// sub 8
|
||||
const svint8_t qx0s = svsub_n_s8_x(ph32, qx0, 8);
|
||||
const svint8_t qx1s = svsub_n_s8_x(ph32, qx1, 8);
|
||||
|
||||
// load y
|
||||
const svint8_t qy0 = svld1_s8(ph32, y0->qs);
|
||||
const svint8_t qy1 = svld1_s8(ph32, y1->qs);
|
||||
|
||||
// dot product
|
||||
sumv0 = svmla_n_f32_x(ph32, sumv0, svcvt_f32_s32_x(ph32,
|
||||
svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = svmla_n_f32_x(ph32, sumv1, svcvt_f32_s32_x(ph32,
|
||||
svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
}
|
||||
|
||||
sumf = svaddv_f32(ph32, svadd_f32_x(ph32, sumv0, sumv1));
|
||||
} break;
|
||||
default:
|
||||
assert(false && "Unsupported vector length");
|
||||
break;
|
||||
}
|
||||
|
||||
#elif defined(__ARM_NEON)
|
||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||
@ -4107,37 +4213,37 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
|
||||
sumf = hsum_float_8(acc);
|
||||
#elif defined(__AVX__)
|
||||
// Initialize accumulator with zeros
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
const __m128i mone = _mm_set1_epi16(1);
|
||||
|
||||
// Main loop
|
||||
for (; ib < nb; ++ib) {
|
||||
// Compute combined scale for the block
|
||||
const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
|
||||
__m256 accum1 = _mm256_setzero_ps();
|
||||
__m256 accum2 = _mm256_setzero_ps();
|
||||
for (; ib + 1 < nb; ib += 2) {
|
||||
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
|
||||
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
|
||||
const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
|
||||
const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
|
||||
const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
|
||||
const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
|
||||
|
||||
const __m128i lowMask = _mm_set1_epi8(0xF);
|
||||
const __m128i off = _mm_set1_epi8(8);
|
||||
|
||||
const __m128i tmp = _mm_loadu_si128((const __m128i *)x[ib].qs);
|
||||
|
||||
__m128i bx_0 = _mm_and_si128(lowMask, tmp);
|
||||
__m128i by_0 = _mm_loadu_si128((const __m128i *)y[ib].qs);
|
||||
bx_0 = _mm_sub_epi8(bx_0, off);
|
||||
const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
|
||||
|
||||
bx_0 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
|
||||
by_0 = _mm_loadu_si128((const __m128i *)(y[ib].qs + 16));
|
||||
bx_0 = _mm_sub_epi8(bx_0, off);
|
||||
const __m128i i32_1 = mul_sum_i8_pairs(bx_0, by_0);
|
||||
|
||||
// Convert int32_t to float
|
||||
__m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));
|
||||
|
||||
// Apply the scale, and accumulate
|
||||
acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
|
||||
const __m128i q4b_1_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_1), _mm_set1_epi8(8));
|
||||
const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8));
|
||||
const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8));
|
||||
const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8));
|
||||
const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
|
||||
const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
|
||||
const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
|
||||
const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
|
||||
const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
|
||||
const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
|
||||
const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
|
||||
const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
|
||||
accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
|
||||
_mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
|
||||
accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
|
||||
_mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
|
||||
}
|
||||
|
||||
sumf = hsum_float_8(acc);
|
||||
sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
|
||||
#elif defined(__SSSE3__)
|
||||
// set constants
|
||||
const __m128i lowMask = _mm_set1_epi8(0xF);
|
||||
@ -5488,10 +5594,50 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
float sumf = 0;
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
if (ggml_sve_cnt_b == QK8_0) {
|
||||
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
||||
|
||||
const int vector_length = ggml_sve_cnt_b*8;
|
||||
|
||||
//VLA Implemenation for SVE
|
||||
switch (vector_length) {
|
||||
case 128:
|
||||
{
|
||||
// predicate for activating lanes for 16 Int8 elements
|
||||
const svbool_t ph16 = svptrue_pat_b8 (SV_VL16);
|
||||
const svbool_t pl16 = svptrue_pat_b32(SV_VL4);
|
||||
|
||||
for (; ib + 1 < nb; ib += 2) {
|
||||
const block_q8_0 * restrict x0 = &x[ib + 0];
|
||||
const block_q8_0 * restrict x1 = &x[ib + 1];
|
||||
const block_q8_0 * restrict y0 = &y[ib + 0];
|
||||
const block_q8_0 * restrict y1 = &y[ib + 1];
|
||||
|
||||
// load x
|
||||
const svint8_t qx0_0 = svld1_s8(ph16, x0->qs);
|
||||
const svint8_t qx0_1 = svld1_s8(ph16, x0->qs+16);
|
||||
const svint8_t qx1_0 = svld1_s8(ph16, x1->qs);
|
||||
const svint8_t qx1_1 = svld1_s8(ph16, x1->qs+16);
|
||||
|
||||
// load y
|
||||
const svint8_t qy0_0 = svld1_s8(ph16, y0->qs);
|
||||
const svint8_t qy0_1 = svld1_s8(ph16, y0->qs+16);
|
||||
const svint8_t qy1_0 = svld1_s8(ph16, y1->qs);
|
||||
const svint8_t qy1_1 = svld1_s8(ph16, y1->qs+16);
|
||||
|
||||
sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16,
|
||||
svdot_s32(svdup_n_s32(0), qx0_0, qy0_0),
|
||||
svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16,
|
||||
svdot_s32(svdup_n_s32(0), qx1_0, qy1_0),
|
||||
svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
}
|
||||
|
||||
sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1));
|
||||
} break;
|
||||
case 256:
|
||||
{
|
||||
//printf("sve256");
|
||||
for (; ib + 1 < nb; ib += 2) {
|
||||
const block_q8_0 * restrict x0 = &x[ib + 0];
|
||||
const block_q8_0 * restrict x1 = &x[ib + 1];
|
||||
@ -5506,11 +5652,66 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
|
||||
const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
|
||||
|
||||
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
|
||||
svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
|
||||
svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
}
|
||||
|
||||
sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
||||
} break;
|
||||
case 512:
|
||||
{
|
||||
// predicate for activating high 256 bit
|
||||
const svbool_t ph32 = svptrue_pat_b8(SV_VL32);
|
||||
// predicate for activating low 256 bit
|
||||
const svbool_t pl32 = svnot_b_z(svptrue_b8(), ph32);
|
||||
|
||||
// predicate for activating high lanes for 8 float32 elements
|
||||
const svbool_t ph8 = svptrue_pat_b32(SV_VL8);
|
||||
// predicate for activating low lanes for 8 float32 elements
|
||||
const svbool_t pl8 = svnot_b_z(svptrue_b32(), ph8);
|
||||
|
||||
svfloat32_t sumv00 = svdup_n_f32(0.0f);
|
||||
|
||||
for (; ib + 1 < nb; ib += 2) {
|
||||
const block_q8_0 * restrict x0 = &x[ib + 0];
|
||||
const block_q8_0 * restrict x1 = &x[ib + 1];
|
||||
const block_q8_0 * restrict y0 = &y[ib + 0];
|
||||
const block_q8_0 * restrict y1 = &y[ib + 1];
|
||||
|
||||
//load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits
|
||||
// and add them to make one 64 element vector
|
||||
// load x
|
||||
const svint8_t qx_32 = svld1_s8(ph32, x0->qs);
|
||||
svint8_t qx_64 = svld1_s8(pl32, x0->qs + 2);
|
||||
|
||||
qx_64 = svadd_s8_x(svptrue_b8(), qx_32, qx_64);
|
||||
|
||||
// load y
|
||||
const svint8_t qy_32 = svld1_s8(ph32, y0->qs);
|
||||
svint8_t qy_64 = svld1_s8(pl32, y0->qs + 2);
|
||||
|
||||
qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64);
|
||||
|
||||
// scale creation
|
||||
const float32_t deq1 = GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d);
|
||||
const float32_t deq2 = GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d);
|
||||
|
||||
// duplicate deq1 in first half of vector and deq2 in second half of vector
|
||||
const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2);
|
||||
|
||||
const svfloat32_t sumvt = svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx_64, qy_64));
|
||||
|
||||
sumv00 = svmla_f32_m(svptrue_b32(), sumv00, sumvt, temp);
|
||||
}
|
||||
|
||||
sumf = svaddv_f32(svptrue_b32(), sumv00);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
assert(false && "Unsupported vector length");
|
||||
break;
|
||||
}
|
||||
#elif defined(__ARM_NEON)
|
||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||
@ -11625,15 +11826,6 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
#if defined(__AVX__)
|
||||
static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
|
||||
const __m128i ax = _mm_sign_epi8(x, x);
|
||||
const __m128i sy = _mm_sign_epi8(y, x);
|
||||
return _mm_maddubs_epi16(ax, sy);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
|
||||
const __m256i ax = _mm256_sign_epi8(x, x);
|
||||
|
@ -1,5 +1,5 @@
|
||||
#include "ggml-rpc.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include <cinttypes>
|
||||
@ -883,15 +883,17 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp
|
||||
}
|
||||
result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor->buffer);
|
||||
if (result->buffer && buffers.find(result->buffer) == buffers.end()) {
|
||||
return nullptr;
|
||||
result->buffer = nullptr;
|
||||
}
|
||||
|
||||
if (result->buffer) {
|
||||
// require that the tensor data does not go beyond the buffer end
|
||||
uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
|
||||
uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
|
||||
uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
|
||||
GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow
|
||||
GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size);
|
||||
}
|
||||
|
||||
result->op = (ggml_op) tensor->op;
|
||||
for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
|
||||
@ -1060,7 +1062,7 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<u
|
||||
const rpc_tensor * tensors = (const rpc_tensor *)(input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t) + sizeof(n_tensors));
|
||||
GGML_PRINT_DEBUG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors);
|
||||
|
||||
static size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
|
||||
size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ buf_size,
|
||||
/*.mem_buffer =*/ NULL,
|
||||
|
@ -33,7 +33,7 @@
|
||||
#include <sycl/half_type.hpp>
|
||||
|
||||
#include "ggml-sycl.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include "ggml-sycl/backend.hpp"
|
||||
@ -5137,13 +5137,17 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_SQR:
|
||||
case GGML_OP_CLAMP:
|
||||
return true;
|
||||
case GGML_OP_CONT:
|
||||
return op->src[0]->type != GGML_TYPE_BF16;
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
return true;
|
||||
case GGML_OP_ROPE:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
case GGML_OP_IM2COL:
|
||||
// TODO: add support for the new F32 operations
|
||||
return op->src[0]->type == GGML_TYPE_F16;
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_ARGSORT:
|
||||
|
@ -21,7 +21,7 @@
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#include "ggml-vulkan-shaders.hpp"
|
||||
|
@ -2,6 +2,7 @@
|
||||
#define _USE_MATH_DEFINES // For M_PI on MSVC
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
#include "ggml-quants.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-aarch64.h"
|
||||
@ -287,6 +288,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
|
||||
#define GGML_DEBUG 0
|
||||
#define GGML_GELU_FP16
|
||||
#define GGML_GELU_QUICK_FP16
|
||||
#define GGML_N_TASKS_MAX (-1)
|
||||
|
||||
#define GGML_SOFT_MAX_UNROLL 4
|
||||
#define GGML_VEC_DOT_UNROLL 2
|
||||
@ -1124,17 +1126,17 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
|
||||
{ \
|
||||
int offset = GGML_F32_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vaddq_f32(x[i], x[offset+i]); \
|
||||
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vaddq_f32(x[i], x[offset+i]); \
|
||||
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vaddq_f32(x[i], x[offset+i]); \
|
||||
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
res = GGML_F32x4_REDUCE_ONE(x[0]); \
|
||||
(res) = GGML_F32x4_REDUCE_ONE((x)[0]); \
|
||||
}
|
||||
|
||||
#define GGML_F32_VEC GGML_F32x4
|
||||
@ -1165,26 +1167,26 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
|
||||
do { \
|
||||
int offset = GGML_F16_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vaddq_f16(x[i], x[offset+i]); \
|
||||
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vaddq_f16(x[i], x[offset+i]); \
|
||||
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vaddq_f16(x[i], x[offset+i]); \
|
||||
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
|
||||
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
|
||||
res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
|
||||
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
|
||||
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
|
||||
(res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
|
||||
} while (0)
|
||||
|
||||
#define GGML_F16_VEC GGML_F16x8
|
||||
#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
|
||||
#define GGML_F16_VEC_SET1 GGML_F16x8_SET1
|
||||
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
|
||||
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r[i])
|
||||
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i])
|
||||
#define GGML_F16_VEC_FMA GGML_F16x8_FMA
|
||||
#define GGML_F16_VEC_ADD GGML_F16x8_ADD
|
||||
#define GGML_F16_VEC_MUL GGML_F16x8_MUL
|
||||
@ -1893,6 +1895,23 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
|
||||
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
|
||||
#endif
|
||||
|
||||
//
|
||||
// ggml object
|
||||
//
|
||||
|
||||
struct ggml_object {
|
||||
size_t offs;
|
||||
size_t size;
|
||||
|
||||
struct ggml_object * next;
|
||||
|
||||
enum ggml_object_type type;
|
||||
|
||||
char padding[4];
|
||||
};
|
||||
|
||||
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
|
||||
|
||||
//
|
||||
// ggml context
|
||||
//
|
||||
@ -3381,7 +3400,7 @@ double ggml_type_sizef(enum ggml_type type) {
|
||||
}
|
||||
|
||||
GGML_CALL const char * ggml_type_name(enum ggml_type type) {
|
||||
return type_traits[type].type_name;
|
||||
return type < GGML_TYPE_COUNT ? type_traits[type].type_name : "NONE";
|
||||
}
|
||||
|
||||
GGML_CALL bool ggml_is_quantized(enum ggml_type type) {
|
||||
@ -3847,7 +3866,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
|
||||
|
||||
if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
|
||||
GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
|
||||
__func__, cur_end + size_needed, ctx->mem_size);
|
||||
__func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
|
||||
assert(false);
|
||||
return NULL;
|
||||
}
|
||||
@ -19161,6 +19180,34 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) {
|
||||
ggml_hash_set_reset(&cgraph->visited_hash_set);
|
||||
}
|
||||
|
||||
int ggml_graph_size(struct ggml_cgraph * cgraph) {
|
||||
return cgraph->size;
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) {
|
||||
if (i < 0) {
|
||||
GGML_ASSERT(cgraph->n_nodes + i >= 0);
|
||||
return cgraph->nodes[cgraph->n_nodes + i];
|
||||
}
|
||||
|
||||
GGML_ASSERT(i < cgraph->n_nodes);
|
||||
return cgraph->nodes[i];
|
||||
}
|
||||
|
||||
struct ggml_tensor ** ggml_graph_nodes(struct ggml_cgraph * cgraph) {
|
||||
return cgraph->nodes;
|
||||
}
|
||||
|
||||
int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) {
|
||||
return cgraph->n_nodes;
|
||||
}
|
||||
|
||||
void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
|
||||
GGML_ASSERT(cgraph->size > cgraph->n_nodes);
|
||||
cgraph->nodes[cgraph->n_nodes] = tensor;
|
||||
cgraph->n_nodes++;
|
||||
}
|
||||
|
||||
// Android's libc implementation "bionic" does not support setting affinity
|
||||
#if defined(__gnu_linux__)
|
||||
static void set_numa_thread_affinity(int thread_n) {
|
||||
@ -23242,6 +23289,14 @@ int ggml_cpu_has_arm_fma(void) {
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_riscv_v(void) {
|
||||
#if defined(__riscv_v_intrinsic)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_metal(void) {
|
||||
#if defined(GGML_USE_METAL)
|
||||
return 1;
|
||||
|
@ -50,6 +50,7 @@
|
||||
|
||||
#include "sgemm.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
#include "ggml-quants.h"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
@ -235,6 +236,14 @@ template <> inline __m512 load(const ggml_fp16_t *p) {
|
||||
}
|
||||
#endif // __AVX512F__
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// CONSTANTS
|
||||
|
||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
|
||||
static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
|
||||
static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// FLOATING POINT MATRIX MULTIPLICATION
|
||||
|
||||
@ -933,6 +942,20 @@ class tinyBLAS_Q0_AVX {
|
||||
return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
|
||||
}
|
||||
|
||||
inline __m256i load(const block_iq4_nl *b) {
|
||||
return MM256_SET_M128I(load1(b), load0(b));
|
||||
}
|
||||
|
||||
inline __m128i load0(const block_iq4_nl *b) {
|
||||
const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
|
||||
return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), x));
|
||||
}
|
||||
|
||||
inline __m128i load1(const block_iq4_nl *b) {
|
||||
const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
|
||||
return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)));
|
||||
}
|
||||
|
||||
inline __m256 updot(__m256i u, __m256i s) {
|
||||
__m256i res;
|
||||
#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
|
||||
@ -1159,6 +1182,22 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
#endif
|
||||
}
|
||||
|
||||
case GGML_TYPE_IQ4_NL: {
|
||||
if (Btype != GGML_TYPE_Q8_0)
|
||||
return false;
|
||||
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
|
||||
tinyBLAS_Q0_AVX<block_iq4_nl, block_q8_0, float> tb{
|
||||
k, (const block_iq4_nl *)A, lda,
|
||||
(const block_q8_0 *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
@ -210,6 +210,7 @@ class MODEL_ARCH(IntEnum):
|
||||
ORION = auto()
|
||||
INTERNLM2 = auto()
|
||||
MINICPM = auto()
|
||||
MINICPM3 = auto()
|
||||
GEMMA = auto()
|
||||
GEMMA2 = auto()
|
||||
STARCODER2 = auto()
|
||||
@ -219,6 +220,7 @@ class MODEL_ARCH(IntEnum):
|
||||
COMMAND_R = auto()
|
||||
DBRX = auto()
|
||||
OLMO = auto()
|
||||
OLMOE = auto()
|
||||
OPENELM = auto()
|
||||
ARCTIC = auto()
|
||||
DEEPSEEK2 = auto()
|
||||
@ -364,6 +366,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.ORION: "orion",
|
||||
MODEL_ARCH.INTERNLM2: "internlm2",
|
||||
MODEL_ARCH.MINICPM: "minicpm",
|
||||
MODEL_ARCH.MINICPM3: "minicpm3",
|
||||
MODEL_ARCH.GEMMA: "gemma",
|
||||
MODEL_ARCH.GEMMA2: "gemma2",
|
||||
MODEL_ARCH.STARCODER2: "starcoder2",
|
||||
@ -373,6 +376,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.COMMAND_R: "command-r",
|
||||
MODEL_ARCH.DBRX: "dbrx",
|
||||
MODEL_ARCH.OLMO: "olmo",
|
||||
MODEL_ARCH.OLMOE: "olmoe",
|
||||
MODEL_ARCH.OPENELM: "openelm",
|
||||
MODEL_ARCH.ARCTIC: "arctic",
|
||||
MODEL_ARCH.DEEPSEEK2: "deepseek2",
|
||||
@ -869,6 +873,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
],
|
||||
MODEL_ARCH.MINICPM3: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q_A,
|
||||
MODEL_TENSOR.ATTN_Q_B,
|
||||
MODEL_TENSOR.ATTN_KV_A_MQA,
|
||||
MODEL_TENSOR.ATTN_KV_B,
|
||||
MODEL_TENSOR.ATTN_Q_A_NORM,
|
||||
MODEL_TENSOR.ATTN_KV_A_NORM,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.GEMMA: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
@ -1010,6 +1031,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.OLMOE: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q_NORM,
|
||||
MODEL_TENSOR.ATTN_K_NORM,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE_INP,
|
||||
MODEL_TENSOR.FFN_GATE_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
],
|
||||
MODEL_ARCH.OPENELM: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
|
@ -13,7 +13,7 @@ class TensorNameMap:
|
||||
"transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone
|
||||
"transformer.word_embeddings", # falcon
|
||||
"word_embeddings", # bloom
|
||||
"model.embed_tokens", # llama-hf nemotron
|
||||
"model.embed_tokens", # llama-hf nemotron olmoe
|
||||
"tok_embeddings", # llama-pth
|
||||
"embeddings.word_embeddings", # bert nomic-bert
|
||||
"language_model.embedding.word_embeddings", # persimmon
|
||||
@ -54,7 +54,7 @@ class TensorNameMap:
|
||||
# Output
|
||||
MODEL_TENSOR.OUTPUT: (
|
||||
"embed_out", # gptneox
|
||||
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone
|
||||
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe
|
||||
"output", # llama-pth bloom internlm2
|
||||
"word_embeddings_for_head", # persimmon
|
||||
"lm_head.linear", # phi2
|
||||
@ -66,7 +66,7 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.OUTPUT_NORM: (
|
||||
"gpt_neox.final_layer_norm", # gptneox
|
||||
"transformer.ln_f", # gpt2 gpt-j falcon jais exaone
|
||||
"model.norm", # llama-hf baichuan internlm2
|
||||
"model.norm", # llama-hf baichuan internlm2 olmoe
|
||||
"norm", # llama-pth
|
||||
"transformer.norm_f", # mpt dbrx
|
||||
"ln_f", # refact bloom qwen gpt2
|
||||
@ -98,7 +98,7 @@ class TensorNameMap:
|
||||
"transformer.h.{bid}.input_layernorm", # falcon7b
|
||||
"h.{bid}.input_layernorm", # bloom
|
||||
"transformer.h.{bid}.ln_mlp", # falcon40b
|
||||
"model.layers.{bid}.input_layernorm", # llama-hf nemotron
|
||||
"model.layers.{bid}.input_layernorm", # llama-hf nemotron olmoe
|
||||
"layers.{bid}.attention_norm", # llama-pth
|
||||
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon
|
||||
"model.layers.{bid}.ln1", # yi
|
||||
@ -142,7 +142,7 @@ class TensorNameMap:
|
||||
|
||||
# Attention query
|
||||
MODEL_TENSOR.ATTN_Q: (
|
||||
"model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron
|
||||
"model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe
|
||||
"layers.{bid}.attention.wq", # llama-pth
|
||||
"encoder.layer.{bid}.attention.self.query", # bert
|
||||
"transformer.h.{bid}.attn.q_proj", # gpt-j
|
||||
@ -154,7 +154,7 @@ class TensorNameMap:
|
||||
|
||||
# Attention key
|
||||
MODEL_TENSOR.ATTN_K: (
|
||||
"model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron
|
||||
"model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe
|
||||
"layers.{bid}.attention.wk", # llama-pth
|
||||
"encoder.layer.{bid}.attention.self.key", # bert
|
||||
"transformer.h.{bid}.attn.k_proj", # gpt-j
|
||||
@ -167,7 +167,7 @@ class TensorNameMap:
|
||||
|
||||
# Attention value
|
||||
MODEL_TENSOR.ATTN_V: (
|
||||
"model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron
|
||||
"model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe
|
||||
"layers.{bid}.attention.wv", # llama-pth
|
||||
"encoder.layer.{bid}.attention.self.value", # bert
|
||||
"transformer.h.{bid}.attn.v_proj", # gpt-j
|
||||
@ -185,7 +185,7 @@ class TensorNameMap:
|
||||
"transformer.blocks.{bid}.attn.out_proj", # mpt
|
||||
"transformer.h.{bid}.self_attention.dense", # falcon
|
||||
"h.{bid}.self_attention.dense", # bloom
|
||||
"model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron
|
||||
"model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe
|
||||
"layers.{bid}.attention.wo", # llama-pth
|
||||
"encoder.layer.{bid}.attention.output.dense", # bert
|
||||
"transformer.h.{bid}.attn.out_proj", # gpt-j
|
||||
@ -229,7 +229,7 @@ class TensorNameMap:
|
||||
"transformer.h.{bid}.ln_2", # gpt2 refact qwen jais exaone
|
||||
"h.{bid}.post_attention_layernorm", # bloom
|
||||
"transformer.blocks.{bid}.norm_2", # mpt
|
||||
"model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron
|
||||
"model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron olmoe
|
||||
"layers.{bid}.ffn_norm", # llama-pth
|
||||
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
|
||||
"model.layers.{bid}.ln2", # yi
|
||||
@ -253,7 +253,7 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.FFN_GATE_INP: (
|
||||
"layers.{bid}.feed_forward.gate", # mixtral
|
||||
"model.layers.{bid}.block_sparse_moe.gate", # mixtral
|
||||
"model.layers.{bid}.mlp.gate", # qwen2moe
|
||||
"model.layers.{bid}.mlp.gate", # qwen2moe olmoe
|
||||
"transformer.decoder_layer.{bid}.router", # Grok
|
||||
"transformer.blocks.{bid}.ffn.router.layer", # dbrx
|
||||
),
|
||||
@ -295,7 +295,7 @@ class TensorNameMap:
|
||||
"layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
|
||||
"transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
|
||||
"transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
|
||||
"model.layers.{bid}.mlp.experts.up_proj", # qwen2moe (merged)
|
||||
"model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged)
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_UP_SHEXP: (
|
||||
@ -327,7 +327,7 @@ class TensorNameMap:
|
||||
"layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
|
||||
"transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
|
||||
"transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
|
||||
"model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe (merged)
|
||||
"model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged)
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_SHEXP: (
|
||||
@ -367,7 +367,7 @@ class TensorNameMap:
|
||||
"layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
|
||||
"transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
|
||||
"transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
|
||||
"model.layers.{bid}.mlp.experts.down_proj", # qwen2moe (merged)
|
||||
"model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged)
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP: (
|
||||
@ -378,7 +378,7 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.ATTN_Q_NORM: (
|
||||
"language_model.encoder.layers.{bid}.self_attention.q_layernorm",
|
||||
"model.layers.{bid}.self_attn.q_layernorm", # persimmon
|
||||
"model.layers.{bid}.self_attn.q_norm", # cohere
|
||||
"model.layers.{bid}.self_attn.q_norm", # cohere olmoe
|
||||
"transformer.blocks.{bid}.attn.q_ln", # sea-lion
|
||||
"encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
|
||||
"transformer.layers.{bid}.attn.q_norm", # openelm
|
||||
@ -387,7 +387,7 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.ATTN_K_NORM: (
|
||||
"language_model.encoder.layers.{bid}.self_attention.k_layernorm",
|
||||
"model.layers.{bid}.self_attn.k_layernorm", # persimmon
|
||||
"model.layers.{bid}.self_attn.k_norm", # cohere
|
||||
"model.layers.{bid}.self_attn.k_norm", # cohere olmoe
|
||||
"transformer.blocks.{bid}.attn.k_ln", # sea-lion
|
||||
"encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
|
||||
"transformer.layers.{bid}.attn.k_norm", # openelm
|
||||
|
@ -343,7 +343,7 @@ extern "C" {
|
||||
bool embeddings; // if true, extract embeddings (together with logits)
|
||||
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
|
||||
//bool no_perf; // whether to measure performance timings, TODO: implement
|
||||
bool no_perf; // whether to measure performance timings
|
||||
|
||||
// Abort callback
|
||||
// if it returns true, execution of llama_decode() will be aborted
|
||||
@ -1056,6 +1056,9 @@ extern "C" {
|
||||
LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
|
||||
LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain);
|
||||
|
||||
// after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
|
||||
LLAMA_API struct llama_sampler * llama_sampler_chain_remove( struct llama_sampler * chain, int32_t i);
|
||||
|
||||
// available samplers:
|
||||
|
||||
LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void);
|
||||
@ -1127,15 +1130,20 @@ extern "C" {
|
||||
int32_t n_logit_bias,
|
||||
const llama_logit_bias * logit_bias);
|
||||
|
||||
// Shorthand for:
|
||||
|
||||
// Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
|
||||
LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
|
||||
|
||||
/// @details Sample and accept a token from the idx-th output of the last evaluation
|
||||
//
|
||||
// Shorthand for:
|
||||
// const auto * logits = llama_get_logits_ith(ctx, idx);
|
||||
// llama_token_data_array cur_p = { ... init from logits ... };
|
||||
// llama_sampler_apply(smpl, &cur_p);
|
||||
// return cur_p.data[cur_p.selected].id;
|
||||
//
|
||||
// At this point, this is mostly a convenience function.
|
||||
//
|
||||
// auto token = cur_p.data[cur_p.selected].id;
|
||||
// llama_sampler_accept(smpl, token);
|
||||
// return token;
|
||||
// Returns the sampled token
|
||||
LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);
|
||||
|
||||
// TODO: extend in the future
|
||||
@ -1168,13 +1176,30 @@ extern "C" {
|
||||
// NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
|
||||
//
|
||||
|
||||
enum llama_perf_type {
|
||||
LLAMA_PERF_TYPE_CONTEXT = 0,
|
||||
LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
|
||||
struct llama_perf_context_data {
|
||||
double t_start_ms;
|
||||
double t_load_ms;
|
||||
double t_p_eval_ms;
|
||||
double t_eval_ms;
|
||||
|
||||
int32_t n_p_eval;
|
||||
int32_t n_eval;
|
||||
};
|
||||
|
||||
LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
|
||||
LLAMA_API void llama_perf_reset( void * ctx, enum llama_perf_type type);
|
||||
struct llama_perf_sampler_data {
|
||||
double t_sample_ms;
|
||||
|
||||
int32_t n_sample;
|
||||
};
|
||||
|
||||
LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx);
|
||||
LLAMA_API void llama_perf_context_print(const struct llama_context * ctx);
|
||||
LLAMA_API void llama_perf_context_reset( struct llama_context * ctx);
|
||||
|
||||
// NOTE: the following work only with samplers constructed via llama_sampler_chain_init
|
||||
LLAMA_API struct llama_perf_sampler_data llama_perf_sampler (const struct llama_sampler * chain);
|
||||
LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
|
||||
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
|
||||
|
||||
LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
|
||||
|
||||
|
@ -24,6 +24,7 @@ LLAMA_ATTRIBUTE_FORMAT(2, 3)
|
||||
void llama_log_internal (ggml_log_level level, const char * format, ...);
|
||||
void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
|
||||
|
||||
#define LLAMA_LOG(...) llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
|
||||
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
||||
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
|
||||
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user