mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-11 19:21:46 +00:00
common : reimplement logging (#9418)
Some checks are pending
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Waiting to run
Nix CI / nix-eval (macos-latest) (push) Waiting to run
Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run
Nix CI / nix-build (macos-latest) (push) Waiting to run
Nix CI / nix-build (ubuntu-latest) (push) Waiting to run
flake8 Lint / Lint (push) Waiting to run
Python Type-Check / pyright type-check (push) Waiting to run
Some checks are pending
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Waiting to run
Nix CI / nix-eval (macos-latest) (push) Waiting to run
Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run
Nix CI / nix-build (macos-latest) (push) Waiting to run
Nix CI / nix-build (ubuntu-latest) (push) Waiting to run
flake8 Lint / Lint (push) Waiting to run
Python Type-Check / pyright type-check (push) Waiting to run
https://github.com/ggerganov/llama.cpp/pull/9418
This commit is contained in:
parent
e6deac31f7
commit
6262d13e0b
3
.github/workflows/build.yml
vendored
3
.github/workflows/build.yml
vendored
@ -23,6 +23,9 @@ env:
|
|||||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||||
GGML_NLOOP: 3
|
GGML_NLOOP: 3
|
||||||
GGML_N_THREADS: 1
|
GGML_N_THREADS: 1
|
||||||
|
LLAMA_LOG_COLORS: 1
|
||||||
|
LLAMA_LOG_PREFIX: 1
|
||||||
|
LLAMA_LOG_TIMESTAMPS: 1
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
macOS-latest-cmake-arm64:
|
macOS-latest-cmake-arm64:
|
||||||
|
6
.github/workflows/server.yml
vendored
6
.github/workflows/server.yml
vendored
@ -20,6 +20,12 @@ on:
|
|||||||
types: [opened, synchronize, reopened]
|
types: [opened, synchronize, reopened]
|
||||||
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
|
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
|
||||||
|
|
||||||
|
env:
|
||||||
|
LLAMA_LOG_COLORS: 1
|
||||||
|
LLAMA_LOG_PREFIX: 1
|
||||||
|
LLAMA_LOG_TIMESTAMPS: 1
|
||||||
|
LLAMA_LOG_VERBOSITY: 10
|
||||||
|
|
||||||
concurrency:
|
concurrency:
|
||||||
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||||
cancel-in-progress: true
|
cancel-in-progress: true
|
||||||
|
38
Makefile
38
Makefile
@ -54,6 +54,7 @@ TEST_TARGETS = \
|
|||||||
tests/test-grammar-parser \
|
tests/test-grammar-parser \
|
||||||
tests/test-json-schema-to-grammar \
|
tests/test-json-schema-to-grammar \
|
||||||
tests/test-llama-grammar \
|
tests/test-llama-grammar \
|
||||||
|
tests/test-log \
|
||||||
tests/test-model-load-cancel \
|
tests/test-model-load-cancel \
|
||||||
tests/test-opt \
|
tests/test-opt \
|
||||||
tests/test-quantize-fns \
|
tests/test-quantize-fns \
|
||||||
@ -148,6 +149,14 @@ GGML_NO_METAL := 1
|
|||||||
DEPRECATE_WARNING := 1
|
DEPRECATE_WARNING := 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifdef LLAMA_DISABLE_LOGS
|
||||||
|
REMOVE_WARNING := 1
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifdef LLAMA_SERVER_VERBOSE
|
||||||
|
REMOVE_WARNING := 1
|
||||||
|
endif
|
||||||
|
|
||||||
ifndef UNAME_S
|
ifndef UNAME_S
|
||||||
UNAME_S := $(shell uname -s)
|
UNAME_S := $(shell uname -s)
|
||||||
endif
|
endif
|
||||||
@ -351,19 +360,11 @@ ifdef LLAMA_SANITIZE_UNDEFINED
|
|||||||
MK_LDFLAGS += -fsanitize=undefined -g
|
MK_LDFLAGS += -fsanitize=undefined -g
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_SERVER_VERBOSE
|
|
||||||
MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifdef LLAMA_SERVER_SSL
|
ifdef LLAMA_SERVER_SSL
|
||||||
MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
|
MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
|
||||||
MK_LDFLAGS += -lssl -lcrypto
|
MK_LDFLAGS += -lssl -lcrypto
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef LLAMA_DISABLE_LOGS
|
|
||||||
MK_CPPFLAGS += -DLOG_DISABLE_LOGS
|
|
||||||
endif # LLAMA_DISABLE_LOGS
|
|
||||||
|
|
||||||
# warnings
|
# warnings
|
||||||
WARN_FLAGS = \
|
WARN_FLAGS = \
|
||||||
-Wall \
|
-Wall \
|
||||||
@ -931,6 +932,7 @@ OBJ_LLAMA = \
|
|||||||
OBJ_COMMON = \
|
OBJ_COMMON = \
|
||||||
common/common.o \
|
common/common.o \
|
||||||
common/arg.o \
|
common/arg.o \
|
||||||
|
common/log.o \
|
||||||
common/console.o \
|
common/console.o \
|
||||||
common/ngram-cache.o \
|
common/ngram-cache.o \
|
||||||
common/sampling.o \
|
common/sampling.o \
|
||||||
@ -1027,6 +1029,14 @@ $(info - LLAMA_NO_CCACHE)
|
|||||||
$(info )
|
$(info )
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifdef REMOVE_WARNING
|
||||||
|
$(info !!! REMOVAL WARNING !!!)
|
||||||
|
$(info The following LLAMA_ options have been removed and are no longer supported)
|
||||||
|
$(info - LLAMA_DISABLE_LOGS (https://github.com/ggerganov/llama.cpp/pull/9418))
|
||||||
|
$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418))
|
||||||
|
$(info )
|
||||||
|
endif
|
||||||
|
|
||||||
#
|
#
|
||||||
# Build libraries
|
# Build libraries
|
||||||
#
|
#
|
||||||
@ -1168,6 +1178,11 @@ common/arg.o: \
|
|||||||
common/arg.h
|
common/arg.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
|
common/log.o: \
|
||||||
|
common/log.cpp \
|
||||||
|
common/log.h
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
common/sampling.o: \
|
common/sampling.o: \
|
||||||
common/sampling.cpp \
|
common/sampling.cpp \
|
||||||
common/sampling.h \
|
common/sampling.h \
|
||||||
@ -1346,7 +1361,7 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
|
|||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
|
llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
|
||||||
$(OBJ_GGML) $(OBJ_LLAMA)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
@ -1528,6 +1543,11 @@ tests/test-llama-grammar: tests/test-llama-grammar.cpp \
|
|||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
tests/test-log: tests/test-log.cpp \
|
||||||
|
$(OBJ_ALL)
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
tests/test-grammar-parser: tests/test-grammar-parser.cpp \
|
tests/test-grammar-parser: tests/test-grammar-parser.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
|
@ -737,6 +737,9 @@ function gg_sum_embd_bge_small {
|
|||||||
|
|
||||||
## main
|
## main
|
||||||
|
|
||||||
|
export LLAMA_LOG_PREFIX=1
|
||||||
|
export LLAMA_LOG_TIMESTAMPS=1
|
||||||
|
|
||||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||||
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
|
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
|
||||||
rm -rf ${SRC}/models-mnt
|
rm -rf ${SRC}/models-mnt
|
||||||
|
@ -51,21 +51,23 @@ endif()
|
|||||||
set(TARGET common)
|
set(TARGET common)
|
||||||
|
|
||||||
add_library(${TARGET} STATIC
|
add_library(${TARGET} STATIC
|
||||||
base64.hpp
|
|
||||||
common.h
|
|
||||||
common.cpp
|
|
||||||
arg.h
|
|
||||||
arg.cpp
|
arg.cpp
|
||||||
sampling.h
|
arg.h
|
||||||
sampling.cpp
|
base64.hpp
|
||||||
console.h
|
common.cpp
|
||||||
|
common.h
|
||||||
console.cpp
|
console.cpp
|
||||||
json.hpp
|
console.h
|
||||||
json-schema-to-grammar.cpp
|
json-schema-to-grammar.cpp
|
||||||
train.h
|
json.hpp
|
||||||
train.cpp
|
log.cpp
|
||||||
ngram-cache.h
|
log.h
|
||||||
ngram-cache.cpp
|
ngram-cache.cpp
|
||||||
|
ngram-cache.h
|
||||||
|
sampling.cpp
|
||||||
|
sampling.h
|
||||||
|
train.cpp
|
||||||
|
train.h
|
||||||
)
|
)
|
||||||
|
|
||||||
if (BUILD_SHARED_LIBS)
|
if (BUILD_SHARED_LIBS)
|
||||||
|
115
common/arg.cpp
115
common/arg.cpp
@ -1,15 +1,17 @@
|
|||||||
#include "arg.h"
|
#include "arg.h"
|
||||||
|
|
||||||
|
#include "log.h"
|
||||||
#include "sampling.h"
|
#include "sampling.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <string>
|
#include <climits>
|
||||||
#include <vector>
|
#include <cstdarg>
|
||||||
#include <set>
|
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <regex>
|
#include <regex>
|
||||||
#include <cstdarg>
|
#include <set>
|
||||||
#include <climits>
|
#include <string>
|
||||||
|
#include <thread>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#include "json-schema-to-grammar.h"
|
#include "json-schema-to-grammar.h"
|
||||||
|
|
||||||
@ -383,20 +385,6 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|||||||
exit(0);
|
exit(0);
|
||||||
}
|
}
|
||||||
));
|
));
|
||||||
add_opt(llama_arg(
|
|
||||||
{"-v", "--verbose"},
|
|
||||||
"print verbose information",
|
|
||||||
[](gpt_params & params) {
|
|
||||||
params.verbosity = 1;
|
|
||||||
}
|
|
||||||
));
|
|
||||||
add_opt(llama_arg(
|
|
||||||
{"--verbosity"}, "N",
|
|
||||||
format("set specific verbosity level (default: %d)", params.verbosity),
|
|
||||||
[](gpt_params & params, int value) {
|
|
||||||
params.verbosity = value;
|
|
||||||
}
|
|
||||||
));
|
|
||||||
add_opt(llama_arg(
|
add_opt(llama_arg(
|
||||||
{"--verbose-prompt"},
|
{"--verbose-prompt"},
|
||||||
format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
|
format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
|
||||||
@ -417,7 +405,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|||||||
[](gpt_params & params) {
|
[](gpt_params & params) {
|
||||||
params.use_color = true;
|
params.use_color = true;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
|
||||||
add_opt(llama_arg(
|
add_opt(llama_arg(
|
||||||
{"-t", "--threads"}, "N",
|
{"-t", "--threads"}, "N",
|
||||||
format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
|
format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
|
||||||
@ -876,7 +864,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|||||||
params.input_prefix = value;
|
params.input_prefix = value;
|
||||||
params.enable_chat_template = false;
|
params.enable_chat_template = false;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
||||||
add_opt(llama_arg(
|
add_opt(llama_arg(
|
||||||
{"--in-suffix"}, "STRING",
|
{"--in-suffix"}, "STRING",
|
||||||
"string to suffix after user inputs with (default: empty)",
|
"string to suffix after user inputs with (default: empty)",
|
||||||
@ -884,7 +872,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|||||||
params.input_suffix = value;
|
params.input_suffix = value;
|
||||||
params.enable_chat_template = false;
|
params.enable_chat_template = false;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
|
||||||
add_opt(llama_arg(
|
add_opt(llama_arg(
|
||||||
{"--no-warmup"},
|
{"--no-warmup"},
|
||||||
"skip warming up the model with an empty run",
|
"skip warming up the model with an empty run",
|
||||||
@ -1824,19 +1812,6 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|||||||
params.system_prompt = system_prompt;
|
params.system_prompt = system_prompt;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
||||||
add_opt(llama_arg(
|
|
||||||
{"--log-format"}, "{text, json}",
|
|
||||||
"log output format: json or text (default: json)",
|
|
||||||
[](gpt_params & params, const std::string & value) {
|
|
||||||
if (value == "json") {
|
|
||||||
params.log_json = true;
|
|
||||||
} else if (value == "text") {
|
|
||||||
params.log_json = false;
|
|
||||||
} else {
|
|
||||||
throw std::invalid_argument("invalid value");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
||||||
add_opt(llama_arg(
|
add_opt(llama_arg(
|
||||||
{"--metrics"},
|
{"--metrics"},
|
||||||
format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
|
format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
|
||||||
@ -1956,39 +1931,57 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|||||||
else { std::invalid_argument("invalid value"); }
|
else { std::invalid_argument("invalid value"); }
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
).set_examples({LLAMA_EXAMPLE_BENCH}));
|
||||||
#ifndef LOG_DISABLE_LOGS
|
|
||||||
// TODO: make this looks less weird
|
|
||||||
add_opt(llama_arg(
|
|
||||||
{"--log-test"},
|
|
||||||
"Log test",
|
|
||||||
[](gpt_params &) { log_param_single_parse("--log-test"); }
|
|
||||||
));
|
|
||||||
add_opt(llama_arg(
|
add_opt(llama_arg(
|
||||||
{"--log-disable"},
|
{"--log-disable"},
|
||||||
"Log disable",
|
"Log disable",
|
||||||
[](gpt_params &) { log_param_single_parse("--log-disable"); }
|
[](gpt_params &) {
|
||||||
));
|
gpt_log_pause(gpt_log_main());
|
||||||
add_opt(llama_arg(
|
}
|
||||||
{"--log-enable"},
|
|
||||||
"Log enable",
|
|
||||||
[](gpt_params &) { log_param_single_parse("--log-enable"); }
|
|
||||||
));
|
|
||||||
add_opt(llama_arg(
|
|
||||||
{"--log-new"},
|
|
||||||
"Log new",
|
|
||||||
[](gpt_params &) { log_param_single_parse("--log-new"); }
|
|
||||||
));
|
|
||||||
add_opt(llama_arg(
|
|
||||||
{"--log-append"},
|
|
||||||
"Log append",
|
|
||||||
[](gpt_params &) { log_param_single_parse("--log-append"); }
|
|
||||||
));
|
));
|
||||||
add_opt(llama_arg(
|
add_opt(llama_arg(
|
||||||
{"--log-file"}, "FNAME",
|
{"--log-file"}, "FNAME",
|
||||||
"Log file",
|
"Log to file",
|
||||||
[](gpt_params &, const std::string & value) { log_param_pair_parse(false, "--log-file", value); }
|
[](gpt_params &, const std::string & value) {
|
||||||
|
gpt_log_set_file(gpt_log_main(), value.c_str());
|
||||||
|
}
|
||||||
));
|
));
|
||||||
#endif // LOG_DISABLE_LOGS
|
add_opt(llama_arg(
|
||||||
|
{"--log-colors"},
|
||||||
|
"Enable colored logging",
|
||||||
|
[](gpt_params &) {
|
||||||
|
gpt_log_set_colors(gpt_log_main(), true);
|
||||||
|
}
|
||||||
|
).set_env("LLAMA_LOG_COLORS"));
|
||||||
|
add_opt(llama_arg(
|
||||||
|
{"-v", "--verbose", "--log-verbose"},
|
||||||
|
"Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
|
||||||
|
[](gpt_params & params) {
|
||||||
|
params.verbosity = INT_MAX;
|
||||||
|
gpt_log_set_verbosity_thold(INT_MAX);
|
||||||
|
}
|
||||||
|
));
|
||||||
|
add_opt(llama_arg(
|
||||||
|
{"-lv", "--verbosity", "--log-verbosity"}, "N",
|
||||||
|
"Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
|
||||||
|
[](gpt_params & params, int value) {
|
||||||
|
params.verbosity = value;
|
||||||
|
gpt_log_set_verbosity_thold(value);
|
||||||
|
}
|
||||||
|
).set_env("LLAMA_LOG_VERBOSITY"));
|
||||||
|
add_opt(llama_arg(
|
||||||
|
{"--log-prefix"},
|
||||||
|
"Enable prefx in log messages",
|
||||||
|
[](gpt_params &) {
|
||||||
|
gpt_log_set_prefix(gpt_log_main(), true);
|
||||||
|
}
|
||||||
|
).set_env("LLAMA_LOG_PREFIX"));
|
||||||
|
add_opt(llama_arg(
|
||||||
|
{"--log-timestamps"},
|
||||||
|
"Enable timestamps in log messages",
|
||||||
|
[](gpt_params &) {
|
||||||
|
gpt_log_set_timestamps(gpt_log_main(), true);
|
||||||
|
}
|
||||||
|
).set_env("LLAMA_LOG_TIMESTAMPS"));
|
||||||
|
|
||||||
return ctx_arg;
|
return ctx_arg;
|
||||||
}
|
}
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "log.h"
|
||||||
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
||||||
#define JSON_ASSERT GGML_ASSERT
|
#define JSON_ASSERT GGML_ASSERT
|
||||||
#include "json.hpp"
|
#include "json.hpp"
|
||||||
@ -25,6 +26,7 @@
|
|||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
#include <unordered_set>
|
#include <unordered_set>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <thread>
|
||||||
|
|
||||||
#if defined(__APPLE__) && defined(__MACH__)
|
#if defined(__APPLE__) && defined(__MACH__)
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
@ -48,7 +50,6 @@
|
|||||||
#if defined(LLAMA_USE_CURL)
|
#if defined(LLAMA_USE_CURL)
|
||||||
#include <curl/curl.h>
|
#include <curl/curl.h>
|
||||||
#include <curl/easy.h>
|
#include <curl/easy.h>
|
||||||
#include <thread>
|
|
||||||
#include <future>
|
#include <future>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -226,7 +227,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!SetPriorityClass(GetCurrentProcess(), p)) {
|
if (!SetPriorityClass(GetCurrentProcess(), p)) {
|
||||||
fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
|
LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -251,7 +252,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!setpriority(PRIO_PROCESS, 0, p)) {
|
if (!setpriority(PRIO_PROCESS, 0, p)) {
|
||||||
fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
|
LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
@ -284,14 +285,14 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model)
|
|||||||
|
|
||||||
if (n_set && n_set < cpuparams.n_threads) {
|
if (n_set && n_set < cpuparams.n_threads) {
|
||||||
// Not enough set bits, may experience performance issues.
|
// Not enough set bits, may experience performance issues.
|
||||||
fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
|
LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
|
bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
|
||||||
size_t dash_loc = range.find('-');
|
size_t dash_loc = range.find('-');
|
||||||
if (dash_loc == std::string::npos) {
|
if (dash_loc == std::string::npos) {
|
||||||
fprintf(stderr, "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
|
LOG_ERR("Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -303,7 +304,7 @@ bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THRE
|
|||||||
} else {
|
} else {
|
||||||
start_i = std::stoull(range.substr(0, dash_loc));
|
start_i = std::stoull(range.substr(0, dash_loc));
|
||||||
if (start_i >= GGML_MAX_N_THREADS) {
|
if (start_i >= GGML_MAX_N_THREADS) {
|
||||||
fprintf(stderr, "Start index out of bounds!\n");
|
LOG_ERR("Start index out of bounds!\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -313,7 +314,7 @@ bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THRE
|
|||||||
} else {
|
} else {
|
||||||
end_i = std::stoull(range.substr(dash_loc + 1));
|
end_i = std::stoull(range.substr(dash_loc + 1));
|
||||||
if (end_i >= GGML_MAX_N_THREADS) {
|
if (end_i >= GGML_MAX_N_THREADS) {
|
||||||
fprintf(stderr, "End index out of bounds!\n");
|
LOG_ERR("End index out of bounds!\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -348,7 +349,7 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
|
|||||||
} else if (c >= 'A' && c <= 'F') {
|
} else if (c >= 'A' && c <= 'F') {
|
||||||
id -= 'A' - 10;
|
id -= 'A' - 10;
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i));
|
LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i));
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -361,6 +362,22 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void gpt_init() {
|
||||||
|
llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
|
||||||
|
if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) {
|
||||||
|
gpt_log_add(gpt_log_main(), level, "%s", text);
|
||||||
|
}
|
||||||
|
}, NULL);
|
||||||
|
|
||||||
|
#ifdef NDEBUG
|
||||||
|
const char * build_type = "";
|
||||||
|
#else
|
||||||
|
const char * build_type = " (debug)";
|
||||||
|
#endif
|
||||||
|
|
||||||
|
LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
|
||||||
|
}
|
||||||
|
|
||||||
std::string gpt_params_get_system_info(const gpt_params & params) {
|
std::string gpt_params_get_system_info(const gpt_params & params) {
|
||||||
std::ostringstream os;
|
std::ostringstream os;
|
||||||
|
|
||||||
@ -441,6 +458,94 @@ void string_replace_all(std::string & s, const std::string & search, const std::
|
|||||||
s = std::move(builder);
|
s = std::move(builder);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string string_from(bool value) {
|
||||||
|
return value ? "true" : "false";
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string string_from(const std::vector<int> & values) {
|
||||||
|
std::stringstream buf;
|
||||||
|
|
||||||
|
buf << "[ ";
|
||||||
|
bool first = true;
|
||||||
|
for (auto e : values) {
|
||||||
|
if (first) {
|
||||||
|
first = false;
|
||||||
|
} else {
|
||||||
|
buf << ", ";
|
||||||
|
}
|
||||||
|
buf << std::to_string(e);
|
||||||
|
}
|
||||||
|
buf << " ]";
|
||||||
|
|
||||||
|
return buf.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
|
||||||
|
std::stringstream buf;
|
||||||
|
|
||||||
|
buf << "[ ";
|
||||||
|
|
||||||
|
bool first = true;
|
||||||
|
for (const auto & token : tokens) {
|
||||||
|
if (!first) {
|
||||||
|
buf << ", ";
|
||||||
|
} else {
|
||||||
|
first = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto detokenized = llama_token_to_piece(ctx, token);
|
||||||
|
|
||||||
|
detokenized.erase(
|
||||||
|
std::remove_if(
|
||||||
|
detokenized.begin(),
|
||||||
|
detokenized.end(),
|
||||||
|
[](const unsigned char c) { return !std::isprint(c); }),
|
||||||
|
detokenized.end());
|
||||||
|
|
||||||
|
buf << "'" << detokenized << "'"
|
||||||
|
<< ":" << std::to_string(token);
|
||||||
|
}
|
||||||
|
|
||||||
|
buf << " ]";
|
||||||
|
|
||||||
|
return buf.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
|
||||||
|
std::stringstream buf;
|
||||||
|
|
||||||
|
buf << "[ ";
|
||||||
|
|
||||||
|
bool first = true;
|
||||||
|
for (int i = 0; i < batch.n_tokens; ++i) {
|
||||||
|
if (!first) {
|
||||||
|
buf << ", ";
|
||||||
|
} else {
|
||||||
|
first = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
|
||||||
|
|
||||||
|
detokenized.erase(
|
||||||
|
std::remove_if(
|
||||||
|
detokenized.begin(),
|
||||||
|
detokenized.end(),
|
||||||
|
[](const unsigned char c) { return !std::isprint(c); }),
|
||||||
|
detokenized.end());
|
||||||
|
|
||||||
|
buf << "\n" << std::to_string(i)
|
||||||
|
<< ":token '" << detokenized << "'"
|
||||||
|
<< ":pos " << std::to_string(batch.pos[i])
|
||||||
|
<< ":n_seq_id " << std::to_string(batch.n_seq_id[i])
|
||||||
|
<< ":seq_id " << std::to_string(batch.seq_id[i][0])
|
||||||
|
<< ":logits " << std::to_string(batch.logits[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
buf << " ]";
|
||||||
|
|
||||||
|
return buf.str();
|
||||||
|
}
|
||||||
|
|
||||||
void string_process_escapes(std::string & input) {
|
void string_process_escapes(std::string & input) {
|
||||||
std::size_t input_len = input.length();
|
std::size_t input_len = input.length();
|
||||||
std::size_t output_idx = 0;
|
std::size_t output_idx = 0;
|
||||||
@ -481,7 +586,7 @@ void string_process_escapes(std::string & input) {
|
|||||||
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
|
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
|
||||||
const char * sep = strchr(data, '=');
|
const char * sep = strchr(data, '=');
|
||||||
if (sep == nullptr || sep - data >= 128) {
|
if (sep == nullptr || sep - data >= 128) {
|
||||||
fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
|
LOG_ERR("%s: malformed KV override '%s'\n", __func__, data);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
llama_model_kv_override kvo;
|
llama_model_kv_override kvo;
|
||||||
@ -504,20 +609,20 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
|
|||||||
} else if (std::strcmp(sep, "false") == 0) {
|
} else if (std::strcmp(sep, "false") == 0) {
|
||||||
kvo.val_bool = false;
|
kvo.val_bool = false;
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
|
LOG_ERR("%s: invalid boolean value for KV override '%s'\n", __func__, data);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
} else if (strncmp(sep, "str:", 4) == 0) {
|
} else if (strncmp(sep, "str:", 4) == 0) {
|
||||||
sep += 4;
|
sep += 4;
|
||||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
|
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
|
||||||
if (strlen(sep) > 127) {
|
if (strlen(sep) > 127) {
|
||||||
fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
|
LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
strncpy(kvo.val_str, sep, 127);
|
strncpy(kvo.val_str, sep, 127);
|
||||||
kvo.val_str[127] = '\0';
|
kvo.val_str[127] = '\0';
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
|
LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
overrides.emplace_back(std::move(kvo));
|
overrides.emplace_back(std::move(kvo));
|
||||||
@ -729,7 +834,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
|
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
|
||||||
return iparams;
|
return iparams;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -737,7 +842,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
|
|||||||
|
|
||||||
llama_context * lctx = llama_new_context_with_model(model, cparams);
|
llama_context * lctx = llama_new_context_with_model(model, cparams);
|
||||||
if (lctx == NULL) {
|
if (lctx == NULL) {
|
||||||
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
return iparams;
|
return iparams;
|
||||||
}
|
}
|
||||||
@ -773,7 +878,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
|
|||||||
loaded_la.scale = la.scale;
|
loaded_la.scale = la.scale;
|
||||||
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
|
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
|
||||||
if (loaded_la.adapter == nullptr) {
|
if (loaded_la.adapter == nullptr) {
|
||||||
fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
|
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
|
||||||
llama_free(lctx);
|
llama_free(lctx);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
return iparams;
|
return iparams;
|
||||||
@ -785,12 +890,12 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (params.sparams.ignore_eos && llama_token_eos(model) == -1) {
|
if (params.sparams.ignore_eos && llama_token_eos(model) == -1) {
|
||||||
fprintf(stderr, "%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
|
||||||
params.sparams.ignore_eos = false;
|
params.sparams.ignore_eos = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.warmup) {
|
if (params.warmup) {
|
||||||
LOG("warming up the model with an empty run\n");
|
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
|
||||||
|
|
||||||
std::vector<llama_token> tmp;
|
std::vector<llama_token> tmp;
|
||||||
llama_token bos = llama_token_bos(model);
|
llama_token bos = llama_token_bos(model);
|
||||||
@ -955,7 +1060,7 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
|
|||||||
int remaining_attempts = max_attempts;
|
int remaining_attempts = max_attempts;
|
||||||
|
|
||||||
while (remaining_attempts > 0) {
|
while (remaining_attempts > 0) {
|
||||||
fprintf(stderr, "%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
|
LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
|
||||||
|
|
||||||
CURLcode res = curl_easy_perform(curl);
|
CURLcode res = curl_easy_perform(curl);
|
||||||
if (res == CURLE_OK) {
|
if (res == CURLE_OK) {
|
||||||
@ -963,13 +1068,14 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
|
|||||||
}
|
}
|
||||||
|
|
||||||
int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
|
int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
|
||||||
fprintf(stderr, "%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
|
LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
|
||||||
|
|
||||||
remaining_attempts--;
|
remaining_attempts--;
|
||||||
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
|
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, "%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
|
LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -978,7 +1084,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|||||||
// Initialize libcurl
|
// Initialize libcurl
|
||||||
std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
|
std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
|
||||||
if (!curl) {
|
if (!curl) {
|
||||||
fprintf(stderr, "%s: error initializing libcurl\n", __func__);
|
LOG_ERR("%s: error initializing libcurl\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1019,11 +1125,11 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|||||||
if (metadata_in.good()) {
|
if (metadata_in.good()) {
|
||||||
try {
|
try {
|
||||||
metadata_in >> metadata;
|
metadata_in >> metadata;
|
||||||
fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
|
LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
|
||||||
if (metadata.contains("url") && metadata.at("url").is_string()) {
|
if (metadata.contains("url") && metadata.at("url").is_string()) {
|
||||||
auto previous_url = metadata.at("url").get<std::string>();
|
auto previous_url = metadata.at("url").get<std::string>();
|
||||||
if (previous_url != url) {
|
if (previous_url != url) {
|
||||||
fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
|
LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1034,12 +1140,12 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|||||||
last_modified = metadata.at("lastModified");
|
last_modified = metadata.at("lastModified");
|
||||||
}
|
}
|
||||||
} catch (const nlohmann::json::exception & e) {
|
} catch (const nlohmann::json::exception & e) {
|
||||||
fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
|
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "%s: no previous model file found %s\n", __func__, path.c_str());
|
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
// Send a HEAD request to retrieve the etag and last-modified headers
|
// Send a HEAD request to retrieve the etag and last-modified headers
|
||||||
@ -1087,26 +1193,26 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|||||||
// HEAD not supported, we don't know if the file has changed
|
// HEAD not supported, we don't know if the file has changed
|
||||||
// force trigger downloading
|
// force trigger downloading
|
||||||
force_download = true;
|
force_download = true;
|
||||||
fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
|
LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool should_download = !file_exists || force_download;
|
bool should_download = !file_exists || force_download;
|
||||||
if (!should_download) {
|
if (!should_download) {
|
||||||
if (!etag.empty() && etag != headers.etag) {
|
if (!etag.empty() && etag != headers.etag) {
|
||||||
fprintf(stderr, "%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
|
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
|
||||||
should_download = true;
|
should_download = true;
|
||||||
} else if (!last_modified.empty() && last_modified != headers.last_modified) {
|
} else if (!last_modified.empty() && last_modified != headers.last_modified) {
|
||||||
fprintf(stderr, "%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
|
LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
|
||||||
should_download = true;
|
should_download = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (should_download) {
|
if (should_download) {
|
||||||
std::string path_temporary = path + ".downloadInProgress";
|
std::string path_temporary = path + ".downloadInProgress";
|
||||||
if (file_exists) {
|
if (file_exists) {
|
||||||
fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
|
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
|
||||||
if (remove(path.c_str()) != 0) {
|
if (remove(path.c_str()) != 0) {
|
||||||
fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path.c_str());
|
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1121,7 +1227,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|||||||
|
|
||||||
std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
|
std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
|
||||||
if (!outfile) {
|
if (!outfile) {
|
||||||
fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str());
|
LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1152,7 +1258,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|||||||
};
|
};
|
||||||
|
|
||||||
// start the download
|
// start the download
|
||||||
fprintf(stderr, "%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
|
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
|
||||||
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
|
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
|
||||||
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
||||||
if (!was_perform_successful) {
|
if (!was_perform_successful) {
|
||||||
@ -1162,7 +1268,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|||||||
long http_code = 0;
|
long http_code = 0;
|
||||||
curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
||||||
if (http_code < 200 || http_code >= 400) {
|
if (http_code < 200 || http_code >= 400) {
|
||||||
fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
|
LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1176,10 +1282,10 @@ static bool llama_download_file(const std::string & url, const std::string & pat
|
|||||||
{"lastModified", headers.last_modified}
|
{"lastModified", headers.last_modified}
|
||||||
});
|
});
|
||||||
std::ofstream(metadata_path) << metadata.dump(4);
|
std::ofstream(metadata_path) << metadata.dump(4);
|
||||||
fprintf(stderr, "%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
|
LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
|
||||||
|
|
||||||
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
|
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
|
||||||
fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
|
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1194,7 +1300,7 @@ struct llama_model * llama_load_model_from_url(
|
|||||||
const struct llama_model_params & params) {
|
const struct llama_model_params & params) {
|
||||||
// Basic validation of the model_url
|
// Basic validation of the model_url
|
||||||
if (!model_url || strlen(model_url) == 0) {
|
if (!model_url || strlen(model_url) == 0) {
|
||||||
fprintf(stderr, "%s: invalid model_url\n", __func__);
|
LOG_ERR("%s: invalid model_url\n", __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1211,7 +1317,7 @@ struct llama_model * llama_load_model_from_url(
|
|||||||
};
|
};
|
||||||
auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
|
auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
|
||||||
if (!ctx_gguf) {
|
if (!ctx_gguf) {
|
||||||
fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model);
|
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1231,14 +1337,12 @@ struct llama_model * llama_load_model_from_url(
|
|||||||
// and extract split URL and PATH prefixes
|
// and extract split URL and PATH prefixes
|
||||||
{
|
{
|
||||||
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
|
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
|
||||||
fprintf(stderr, "\n%s: unexpected model file name: %s"
|
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
|
||||||
" n_split=%d\n", __func__, path_model, n_split);
|
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
|
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
|
||||||
fprintf(stderr, "\n%s: unexpected model url: %s"
|
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
|
||||||
" n_split=%d\n", __func__, model_url, n_split);
|
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1298,7 +1402,7 @@ struct llama_model * llama_load_model_from_url(
|
|||||||
const char * /*path_model*/,
|
const char * /*path_model*/,
|
||||||
const char * /*hf_token*/,
|
const char * /*hf_token*/,
|
||||||
const struct llama_model_params & /*params*/) {
|
const struct llama_model_params & /*params*/) {
|
||||||
fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
|
LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1308,7 +1412,7 @@ struct llama_model * llama_load_model_from_hf(
|
|||||||
const char * /*path_model*/,
|
const char * /*path_model*/,
|
||||||
const char * /*hf_token*/,
|
const char * /*hf_token*/,
|
||||||
const struct llama_model_params & /*params*/) {
|
const struct llama_model_params & /*params*/) {
|
||||||
fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
|
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1636,13 +1740,13 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
|
|||||||
};
|
};
|
||||||
struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
|
struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params);
|
||||||
if (!ctx_gguf) {
|
if (!ctx_gguf) {
|
||||||
fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
|
LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str());
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
|
int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
|
||||||
if (n_tensors == 0) {
|
if (n_tensors == 0) {
|
||||||
fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
|
LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < n_tensors; i++) {
|
for (int i = 0; i < n_tensors; i++) {
|
||||||
@ -1660,23 +1764,23 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (layer_idx < 0) {
|
if (layer_idx < 0) {
|
||||||
fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
|
LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
|
||||||
result.n_embd = -1;
|
result.n_embd = -1;
|
||||||
break;
|
break;
|
||||||
} else if (layer_idx == 0) {
|
} else if (layer_idx == 0) {
|
||||||
fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
|
LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str());
|
||||||
result.n_embd = -1;
|
result.n_embd = -1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
|
struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str());
|
||||||
if (tensor->type != GGML_TYPE_F32) {
|
if (tensor->type != GGML_TYPE_F32) {
|
||||||
fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
|
LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str());
|
||||||
result.n_embd = -1;
|
result.n_embd = -1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (ggml_n_dims(tensor) != 1) {
|
if (ggml_n_dims(tensor) != 1) {
|
||||||
fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
|
LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str());
|
||||||
result.n_embd = -1;
|
result.n_embd = -1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -1684,7 +1788,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
|
|||||||
if (result.n_embd == -1) {
|
if (result.n_embd == -1) {
|
||||||
result.n_embd = ggml_nelements(tensor);
|
result.n_embd = ggml_nelements(tensor);
|
||||||
} else if (ggml_nelements(tensor) != result.n_embd) {
|
} else if (ggml_nelements(tensor) != result.n_embd) {
|
||||||
fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
|
LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str());
|
||||||
result.n_embd = -1;
|
result.n_embd = -1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -1701,7 +1805,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (result.n_embd == -1) {
|
if (result.n_embd == -1) {
|
||||||
fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
|
LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str());
|
||||||
result.data.clear();
|
result.data.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1722,7 +1826,7 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
|
if (result.n_embd != -1 && result.n_embd != cur.n_embd) {
|
||||||
fprintf(stderr, "%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
|
LOG_ERR("%s: control vectors in %s does not match previous dimensions\n", __func__, info.fname.c_str());
|
||||||
result.n_embd = -1;
|
result.n_embd = -1;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -1738,7 +1842,7 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (result.n_embd == -1) {
|
if (result.n_embd == -1) {
|
||||||
fprintf(stderr, "%s: no valid control vector files passed\n", __func__);
|
LOG_ERR("%s: no valid control vector files passed\n", __func__);
|
||||||
result.data.clear();
|
result.data.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -4,11 +4,9 @@
|
|||||||
|
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#define LOG_NO_FILE_LINE_FUNCTION
|
|
||||||
#include "log.h"
|
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <sstream>
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
#define DIRECTORY_SEPARATOR '\\'
|
#define DIRECTORY_SEPARATOR '\\'
|
||||||
@ -343,6 +341,10 @@ struct gpt_params {
|
|||||||
bool batched_bench_output_jsonl = false;
|
bool batched_bench_output_jsonl = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// call once at the start of a program if it uses libcommon
|
||||||
|
// initializes the logging system and prints info about the build
|
||||||
|
void gpt_init();
|
||||||
|
|
||||||
std::string gpt_params_get_system_info(const gpt_params & params);
|
std::string gpt_params_get_system_info(const gpt_params & params);
|
||||||
|
|
||||||
bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
|
bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
|
||||||
@ -378,6 +380,11 @@ static std::vector<T> string_split(const std::string & str, char delim) {
|
|||||||
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
||||||
void string_process_escapes(std::string & input);
|
void string_process_escapes(std::string & input);
|
||||||
|
|
||||||
|
std::string string_from(bool value);
|
||||||
|
std::string string_from(const std::vector<int> & values);
|
||||||
|
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
|
||||||
|
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Filesystem utils
|
// Filesystem utils
|
||||||
//
|
//
|
||||||
|
401
common/log.cpp
Normal file
401
common/log.cpp
Normal file
@ -0,0 +1,401 @@
|
|||||||
|
#include "log.h"
|
||||||
|
|
||||||
|
#include <condition_variable>
|
||||||
|
#include <cstdarg>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <mutex>
|
||||||
|
#include <sstream>
|
||||||
|
#include <thread>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;
|
||||||
|
|
||||||
|
// Overwrite the global verbosity threshold (gpt_log_verbosity_thold) with the given value.
void gpt_log_set_verbosity_thold(int verbosity) { gpt_log_verbosity_thold = verbosity; }
|
||||||
|
|
||||||
|
#define LOG_COL_DEFAULT "\033[0m"
|
||||||
|
#define LOG_COL_BOLD "\033[1m"
|
||||||
|
#define LOG_COL_RED "\033[31m"
|
||||||
|
#define LOG_COL_GREEN "\033[32m"
|
||||||
|
#define LOG_COL_YELLOW "\033[33m"
|
||||||
|
#define LOG_COL_BLUE "\033[34m"
|
||||||
|
#define LOG_COL_MAGENTA "\033[35m"
|
||||||
|
#define LOG_COL_CYAN "\033[36m"
|
||||||
|
#define LOG_COL_WHITE "\033[37m"
|
||||||
|
|
||||||
|
// Wall-clock time in microseconds, measured from the std::chrono::system_clock epoch.
static int64_t t_us() {
    using namespace std::chrono;

    const auto since_epoch = system_clock::now().time_since_epoch();

    return duration_cast<microseconds>(since_epoch).count();
}
|
||||||
|
|
||||||
|
// colors
|
||||||
|
// indices into the g_col table of color escape sequences
enum gpt_log_col : int {
    GPT_LOG_COL_DEFAULT = 0,
    GPT_LOG_COL_BOLD    = 1,
    GPT_LOG_COL_RED     = 2,
    GPT_LOG_COL_GREEN   = 3,
    GPT_LOG_COL_YELLOW  = 4,
    GPT_LOG_COL_BLUE    = 5,
    GPT_LOG_COL_MAGENTA = 6,
    GPT_LOG_COL_CYAN    = 7,
    GPT_LOG_COL_WHITE   = 8,
};
|
||||||
|
|
||||||
|
// color table with nine slots, all starting out as the empty string (colors disabled by default)
static std::vector<const char *> g_col(9, "");
|
||||||
|
|
||||||
|
struct gpt_log_entry {
|
||||||
|
enum ggml_log_level level;
|
||||||
|
|
||||||
|
bool prefix;
|
||||||
|
|
||||||
|
int64_t timestamp;
|
||||||
|
|
||||||
|
std::vector<char> msg;
|
||||||
|
|
||||||
|
// signals the worker thread to stop
|
||||||
|
bool is_end;
|
||||||
|
|
||||||
|
void print(FILE * file = nullptr) const {
|
||||||
|
FILE * fcur = file;
|
||||||
|
if (!fcur) {
|
||||||
|
// stderr displays DBG messages only when their verbosity level is not higher than the threshold
|
||||||
|
// these messages will still be logged to a file
|
||||||
|
if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
fcur = stdout;
|
||||||
|
|
||||||
|
if (level != GGML_LOG_LEVEL_NONE) {
|
||||||
|
fcur = stderr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (level != GGML_LOG_LEVEL_NONE && prefix) {
|
||||||
|
if (timestamp) {
|
||||||
|
// [M.s.ms.us]
|
||||||
|
fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
|
||||||
|
g_col[GPT_LOG_COL_BLUE],
|
||||||
|
(int) (timestamp / 1000000 / 60),
|
||||||
|
(int) (timestamp / 1000000 % 60),
|
||||||
|
(int) (timestamp / 1000 % 1000),
|
||||||
|
(int) (timestamp % 1000),
|
||||||
|
g_col[GPT_LOG_COL_DEFAULT]);
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (level) {
|
||||||
|
case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN], g_col[GPT_LOG_COL_DEFAULT]); break;
|
||||||
|
case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], "" ); break;
|
||||||
|
case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED], "" ); break;
|
||||||
|
case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW], "" ); break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(fcur, "%s", msg.data());
|
||||||
|
|
||||||
|
if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) {
|
||||||
|
fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
|
||||||
|
}
|
||||||
|
|
||||||
|
fflush(fcur);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct gpt_log {
|
||||||
|
// default capacity - will be expanded if needed
|
||||||
|
gpt_log() : gpt_log(256) {}
|
||||||
|
|
||||||
|
gpt_log(size_t capacity) {
|
||||||
|
file = nullptr;
|
||||||
|
prefix = false;
|
||||||
|
timestamps = false;
|
||||||
|
running = false;
|
||||||
|
t_start = t_us();
|
||||||
|
|
||||||
|
// initial message size - will be expanded if longer messages arrive
|
||||||
|
entries.resize(capacity);
|
||||||
|
for (auto & entry : entries) {
|
||||||
|
entry.msg.resize(256);
|
||||||
|
}
|
||||||
|
|
||||||
|
head = 0;
|
||||||
|
tail = 0;
|
||||||
|
|
||||||
|
resume();
|
||||||
|
}
|
||||||
|
|
||||||
|
~gpt_log() {
|
||||||
|
pause();
|
||||||
|
if (file) {
|
||||||
|
fclose(file);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::mutex mtx;
|
||||||
|
std::thread thrd;
|
||||||
|
std::condition_variable cv;
|
||||||
|
|
||||||
|
FILE * file;
|
||||||
|
|
||||||
|
bool prefix;
|
||||||
|
bool timestamps;
|
||||||
|
bool running;
|
||||||
|
|
||||||
|
int64_t t_start;
|
||||||
|
|
||||||
|
// ring buffer of entries
|
||||||
|
std::vector<gpt_log_entry> entries;
|
||||||
|
size_t head;
|
||||||
|
size_t tail;
|
||||||
|
|
||||||
|
// worker thread copies into this
|
||||||
|
gpt_log_entry cur;
|
||||||
|
|
||||||
|
public:
|
||||||
|
void add(enum ggml_log_level level, const char * fmt, va_list args) {
|
||||||
|
std::lock_guard<std::mutex> lock(mtx);
|
||||||
|
|
||||||
|
if (!running) {
|
||||||
|
// discard messages while the worker thread is paused
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto & entry = entries[tail];
|
||||||
|
|
||||||
|
{
|
||||||
|
// cannot use args twice, so make a copy in case we need to expand the buffer
|
||||||
|
va_list args_copy;
|
||||||
|
va_copy(args_copy, args);
|
||||||
|
|
||||||
|
#if 1
|
||||||
|
const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args);
|
||||||
|
if (n >= entry.msg.size()) {
|
||||||
|
entry.msg.resize(n + 1);
|
||||||
|
vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
// hack for bolding arguments
|
||||||
|
|
||||||
|
std::stringstream ss;
|
||||||
|
for (int i = 0; fmt[i] != 0; i++) {
|
||||||
|
if (fmt[i] == '%') {
|
||||||
|
ss << LOG_COL_BOLD;
|
||||||
|
while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++];
|
||||||
|
ss << LOG_COL_DEFAULT;
|
||||||
|
if (fmt[i] == 0) break;
|
||||||
|
}
|
||||||
|
ss << fmt[i];
|
||||||
|
}
|
||||||
|
const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args);
|
||||||
|
if (n >= entry.msg.size()) {
|
||||||
|
entry.msg.resize(n + 1);
|
||||||
|
vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
entry.level = level;
|
||||||
|
entry.prefix = prefix;
|
||||||
|
entry.timestamp = 0;
|
||||||
|
if (timestamps) {
|
||||||
|
entry.timestamp = t_us() - t_start;
|
||||||
|
}
|
||||||
|
entry.is_end = false;
|
||||||
|
|
||||||
|
tail = (tail + 1) % entries.size();
|
||||||
|
if (tail == head) {
|
||||||
|
// expand the buffer
|
||||||
|
std::vector<gpt_log_entry> new_entries(2*entries.size());
|
||||||
|
|
||||||
|
size_t new_tail = 0;
|
||||||
|
|
||||||
|
do {
|
||||||
|
new_entries[new_tail] = std::move(entries[head]);
|
||||||
|
|
||||||
|
head = (head + 1) % entries.size();
|
||||||
|
new_tail = (new_tail + 1);
|
||||||
|
} while (head != tail);
|
||||||
|
|
||||||
|
head = 0;
|
||||||
|
tail = new_tail;
|
||||||
|
|
||||||
|
for (size_t i = tail; i < new_entries.size(); i++) {
|
||||||
|
new_entries[i].msg.resize(256);
|
||||||
|
}
|
||||||
|
|
||||||
|
entries = std::move(new_entries);
|
||||||
|
}
|
||||||
|
|
||||||
|
cv.notify_one();
|
||||||
|
}
|
||||||
|
|
||||||
|
void resume() {
|
||||||
|
std::lock_guard<std::mutex> lock(mtx);
|
||||||
|
|
||||||
|
if (running) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
running = true;
|
||||||
|
|
||||||
|
thrd = std::thread([this]() {
|
||||||
|
while (true) {
|
||||||
|
{
|
||||||
|
std::unique_lock<std::mutex> lock(mtx);
|
||||||
|
cv.wait(lock, [this]() { return head != tail; });
|
||||||
|
|
||||||
|
cur = entries[head];
|
||||||
|
|
||||||
|
head = (head + 1) % entries.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cur.is_end) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
cur.print(); // stdout and stderr
|
||||||
|
|
||||||
|
if (file) {
|
||||||
|
cur.print(file);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
void pause() {
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lock(mtx);
|
||||||
|
|
||||||
|
if (!running) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
running = false;
|
||||||
|
|
||||||
|
// push an entry to signal the worker thread to stop
|
||||||
|
{
|
||||||
|
auto & entry = entries[tail];
|
||||||
|
entry.is_end = true;
|
||||||
|
|
||||||
|
tail = (tail + 1) % entries.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
cv.notify_one();
|
||||||
|
}
|
||||||
|
|
||||||
|
thrd.join();
|
||||||
|
}
|
||||||
|
|
||||||
|
void set_file(const char * path) {
|
||||||
|
pause();
|
||||||
|
|
||||||
|
if (file) {
|
||||||
|
fclose(file);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (path) {
|
||||||
|
file = fopen(path, "w");
|
||||||
|
} else {
|
||||||
|
file = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
resume();
|
||||||
|
}
|
||||||
|
|
||||||
|
void set_colors(bool colors) {
|
||||||
|
pause();
|
||||||
|
|
||||||
|
if (colors) {
|
||||||
|
g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
|
||||||
|
g_col[GPT_LOG_COL_BOLD] = LOG_COL_BOLD;
|
||||||
|
g_col[GPT_LOG_COL_RED] = LOG_COL_RED;
|
||||||
|
g_col[GPT_LOG_COL_GREEN] = LOG_COL_GREEN;
|
||||||
|
g_col[GPT_LOG_COL_YELLOW] = LOG_COL_YELLOW;
|
||||||
|
g_col[GPT_LOG_COL_BLUE] = LOG_COL_BLUE;
|
||||||
|
g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
|
||||||
|
g_col[GPT_LOG_COL_CYAN] = LOG_COL_CYAN;
|
||||||
|
g_col[GPT_LOG_COL_WHITE] = LOG_COL_WHITE;
|
||||||
|
} else {
|
||||||
|
for (size_t i = 0; i < g_col.size(); i++) {
|
||||||
|
g_col[i] = "";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
resume();
|
||||||
|
}
|
||||||
|
|
||||||
|
void set_prefix(bool prefix) {
|
||||||
|
std::lock_guard<std::mutex> lock(mtx);
|
||||||
|
|
||||||
|
this->prefix = prefix;
|
||||||
|
}
|
||||||
|
|
||||||
|
void set_timestamps(bool timestamps) {
|
||||||
|
std::lock_guard<std::mutex> lock(mtx);
|
||||||
|
|
||||||
|
this->timestamps = timestamps;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
//
|
||||||
|
// public API
|
||||||
|
//
|
||||||
|
|
||||||
|
struct gpt_log * gpt_log_init() {
|
||||||
|
return new gpt_log;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct gpt_log * gpt_log_main() {
|
||||||
|
static struct gpt_log log;
|
||||||
|
|
||||||
|
return &log;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Public wrapper: forwards to gpt_log::pause(). `log` must not be null.
void gpt_log_pause(struct gpt_log * log) {
    (*log).pause();
}
|
||||||
|
|
||||||
|
// Public wrapper: forwards to gpt_log::resume(). `log` must not be null.
void gpt_log_resume(struct gpt_log * log) {
    (*log).resume();
}
|
||||||
|
|
||||||
|
// Destroys a logger through `delete`.
// NOTE(review): presumably meant only for loggers obtained from gpt_log_init();
// the object returned by gpt_log_main() is a static and must not be deleted.
void gpt_log_free(struct gpt_log * log) {
    delete log;
}
|
||||||
|
|
||||||
|
// printf-style entry point: packs the variadic arguments into a va_list and
// hands them to gpt_log::add().
//   log   - target logger, must not be null
//   level - level passed through to add()
//   fmt   - printf-style format string, followed by its arguments
void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) {
    va_list ap;

    va_start(ap, fmt);
    log->add(level, fmt, ap);
    va_end(ap);
}
|
||||||
|
|
||||||
|
// Public wrapper: forwards to gpt_log::set_file().
//   file - path of the log file, or nullptr to turn file output off
void gpt_log_set_file(struct gpt_log * log, const char * file) {
    (*log).set_file(file);
}
|
||||||
|
|
||||||
|
// Public wrapper: forwards to gpt_log::set_colors().
void gpt_log_set_colors(struct gpt_log * log, bool colors) {
    (*log).set_colors(colors);
}
|
||||||
|
|
||||||
|
// Public wrapper: forwards to gpt_log::set_prefix().
void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
    (*log).set_prefix(prefix);
}
|
||||||
|
|
||||||
|
// Public wrapper: forwards to gpt_log::set_timestamps().
void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
    (*log).set_timestamps(timestamps);
}
|
786
common/log.h
786
common/log.h
@ -1,724 +1,90 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <chrono>
|
#include "ggml.h" // for ggml_log_level
|
||||||
#include <cstring>
|
|
||||||
#include <sstream>
|
|
||||||
#include <iostream>
|
|
||||||
#include <thread>
|
|
||||||
#include <vector>
|
|
||||||
#include <algorithm>
|
|
||||||
#include <cinttypes>
|
|
||||||
|
|
||||||
// --------------------------------
|
#ifndef __GNUC__
|
||||||
//
|
# define LOG_ATTRIBUTE_FORMAT(...)
|
||||||
// Basic usage:
|
#elif defined(__MINGW32__)
|
||||||
//
|
# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
||||||
// --------
|
|
||||||
//
|
|
||||||
// The LOG() and LOG_TEE() macros are ready to go by default
|
|
||||||
// they do not require any initialization.
|
|
||||||
//
|
|
||||||
// LOGLN() and LOG_TEELN() are variants which automatically
|
|
||||||
// include \n character at the end of the log string.
|
|
||||||
//
|
|
||||||
// LOG() behaves exactly like printf, by default writing to a logfile.
|
|
||||||
// LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ).
|
|
||||||
//
|
|
||||||
// Default logfile is named
|
|
||||||
// "llama.<threadID>.log"
|
|
||||||
// Default LOG_TEE() secondary output target is
|
|
||||||
// stderr
|
|
||||||
//
|
|
||||||
// Logs can be dynamically disabled or enabled using functions:
|
|
||||||
// log_disable()
|
|
||||||
// and
|
|
||||||
// log_enable()
|
|
||||||
//
|
|
||||||
// A log target can be changed with:
|
|
||||||
// log_set_target( string )
|
|
||||||
// creating and opening, or re-opening a file by string filename
|
|
||||||
// or
|
|
||||||
// log_set_target( FILE* )
|
|
||||||
// allowing to point at stderr, stdout, or any valid FILE* file handler.
|
|
||||||
//
|
|
||||||
// --------
|
|
||||||
//
|
|
||||||
// End of Basic usage.
|
|
||||||
//
|
|
||||||
// --------------------------------
|
|
||||||
|
|
||||||
// Specifies a log target.
|
|
||||||
// default uses log_handler() with "llama.log" log file
|
|
||||||
// this can be changed, by defining LOG_TARGET
|
|
||||||
// like so:
|
|
||||||
//
|
|
||||||
// #define LOG_TARGET (a valid FILE*)
|
|
||||||
// #include "log.h"
|
|
||||||
//
|
|
||||||
// or it can be simply redirected to stdout or stderr
|
|
||||||
// like so:
|
|
||||||
//
|
|
||||||
// #define LOG_TARGET stderr
|
|
||||||
// #include "log.h"
|
|
||||||
//
|
|
||||||
// The log target can also be redirected to a different function
|
|
||||||
// like so:
|
|
||||||
//
|
|
||||||
// #define LOG_TARGET log_handler_different()
|
|
||||||
// #include "log.h"
|
|
||||||
//
|
|
||||||
// FILE* log_handler_different()
|
|
||||||
// {
|
|
||||||
// return stderr;
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
// or:
|
|
||||||
//
|
|
||||||
// #define LOG_TARGET log_handler_another_one("somelog.log")
|
|
||||||
// #include "log.h"
|
|
||||||
//
|
|
||||||
// FILE* log_handler_another_one(char*filename)
|
|
||||||
// {
|
|
||||||
// static FILE* logfile = nullptr;
|
|
||||||
// (...)
|
|
||||||
// if( !logfile )
|
|
||||||
// {
|
|
||||||
// fopen(...)
|
|
||||||
// }
|
|
||||||
// (...)
|
|
||||||
// return logfile
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
#ifndef LOG_TARGET
|
|
||||||
#define LOG_TARGET log_handler()
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef LOG_TEE_TARGET
|
|
||||||
#define LOG_TEE_TARGET stderr
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Utility for synchronizing log configuration state
|
|
||||||
// since std::optional was introduced only in c++17
|
|
||||||
// Three-valued switch used by the log configuration helpers.
enum LogTriState
{
    LogTriStateSame  = 0, // leave the current setting untouched
    LogTriStateFalse = 1, // turn the setting off
    LogTriStateTrue  = 2  // turn the setting on
};
|
|
||||||
|
|
||||||
// Utility to obtain "pid" like unique process id and use it when creating log files.
|
|
||||||
// Returns a textual id used to make log file names unique.
// The id is the std::thread::id of the first calling thread, formatted once and
// cached in a function-local static; later calls return the cached text.
inline std::string log_get_pid()
{
    static std::string cached_id;

    if (!cached_id.empty())
    {
        return cached_id;
    }

    // The thread id is not a real process id; the original comment describes it
    // as the most portable stand-in, unique enough to keep multiple instances
    // from writing to the same log.
    std::stringstream id_stream;
    id_stream << std::this_thread::get_id();
    cached_id = id_stream.str();

    return cached_id;
}
|
|
||||||
|
|
||||||
// Utility function for generating log file names with unique id based on thread id.
|
|
||||||
// invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
|
|
||||||
// where the number is a runtime id of the current thread.
|
|
||||||
|
|
||||||
#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension)
|
|
||||||
|
|
||||||
// INTERNAL, DO NOT USE
|
|
||||||
inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension)
|
|
||||||
{
|
|
||||||
static bool _multilog = false;
|
|
||||||
|
|
||||||
if (multilog != LogTriStateSame)
|
|
||||||
{
|
|
||||||
_multilog = multilog == LogTriStateTrue;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::stringstream buf;
|
|
||||||
|
|
||||||
buf << log_file_basename;
|
|
||||||
if (_multilog)
|
|
||||||
{
|
|
||||||
buf << ".";
|
|
||||||
buf << log_get_pid();
|
|
||||||
}
|
|
||||||
buf << ".";
|
|
||||||
buf << log_file_extension;
|
|
||||||
|
|
||||||
return buf.str();
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifndef LOG_DEFAULT_FILE_NAME
|
|
||||||
#define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log")
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Utility for turning #define values into string literals
|
|
||||||
// so we can have a define for stderr and
|
|
||||||
// we can print "stderr" instead of literal stderr, etc.
|
|
||||||
#define LOG_STRINGIZE1(s) #s
|
|
||||||
#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s)
|
|
||||||
|
|
||||||
#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET)
|
|
||||||
|
|
||||||
// Allows disabling timestamps.
|
|
||||||
// in order to disable, define LOG_NO_TIMESTAMPS
|
|
||||||
// like so:
|
|
||||||
//
|
|
||||||
// #define LOG_NO_TIMESTAMPS
|
|
||||||
// #include "log.h"
|
|
||||||
//
|
|
||||||
#ifndef LOG_NO_TIMESTAMPS
|
|
||||||
#ifndef _MSC_VER
|
|
||||||
#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
|
|
||||||
#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
|
|
||||||
#else
|
|
||||||
#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
|
|
||||||
#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
|
|
||||||
#endif
|
|
||||||
#else
|
#else
|
||||||
#define LOG_TIMESTAMP_FMT "%s"
|
# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||||
#define LOG_TIMESTAMP_VAL ,""
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef LOG_TEE_TIMESTAMPS
|
#define LOG_DEFAULT_DEBUG 1
|
||||||
#ifndef _MSC_VER
|
#define LOG_DEFAULT_LLAMA 0
|
||||||
#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
|
|
||||||
#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
|
|
||||||
#else
|
|
||||||
#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
|
|
||||||
#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
|
|
||||||
#endif
|
|
||||||
#else
|
|
||||||
#define LOG_TEE_TIMESTAMP_FMT "%s"
|
|
||||||
#define LOG_TEE_TIMESTAMP_VAL ,""
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Allows disabling file/line/function prefix
|
// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity threshold is lower
|
||||||
// in order to disable, define LOG_NO_FILE_LINE_FUNCTION
|
// set via gpt_log_set_verbosity_thold()
|
||||||
// like so:
|
extern int gpt_log_verbosity_thold;
|
||||||
|
|
||||||
|
void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe
|
||||||
|
|
||||||
|
// the gpt_log uses an internal worker thread to print/write log messages
|
||||||
|
// when the worker thread is paused, incoming log messages are discarded
|
||||||
|
struct gpt_log;
|
||||||
|
|
||||||
|
struct gpt_log * gpt_log_init();
|
||||||
|
struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit
|
||||||
|
void gpt_log_pause (struct gpt_log * log); // pause the worker thread, not thread-safe
|
||||||
|
void gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe
|
||||||
|
void gpt_log_free (struct gpt_log * log);
|
||||||
|
|
||||||
|
LOG_ATTRIBUTE_FORMAT(3, 4)
|
||||||
|
void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...);
|
||||||
|
|
||||||
|
// defaults: file = NULL, colors = false, prefix = false, timestamps = false
|
||||||
//
|
//
|
||||||
// #define LOG_NO_FILE_LINE_FUNCTION
|
// regular log output:
|
||||||
// #include "log.h"
|
|
||||||
//
|
//
|
||||||
#ifndef LOG_NO_FILE_LINE_FUNCTION
|
// ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34)
|
||||||
#ifndef _MSC_VER
|
// llm_load_tensors: ggml ctx size = 0.27 MiB
|
||||||
#define LOG_FLF_FMT "[%24s:%5d][%24s] "
|
// llm_load_tensors: offloading 32 repeating layers to GPU
|
||||||
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
// llm_load_tensors: offloading non-repeating layers to GPU
|
||||||
#else
|
|
||||||
#define LOG_FLF_FMT "[%24s:%5ld][%24s] "
|
|
||||||
#define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
|
|
||||||
#endif
|
|
||||||
#else
|
|
||||||
#define LOG_FLF_FMT "%s"
|
|
||||||
#define LOG_FLF_VAL ,""
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef LOG_TEE_FILE_LINE_FUNCTION
|
|
||||||
#ifndef _MSC_VER
|
|
||||||
#define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
|
|
||||||
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
|
||||||
#else
|
|
||||||
#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
|
|
||||||
#define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
|
|
||||||
#endif
|
|
||||||
#else
|
|
||||||
#define LOG_TEE_FLF_FMT "%s"
|
|
||||||
#define LOG_TEE_FLF_VAL ,""
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// INTERNAL, DO NOT USE
|
|
||||||
// USE LOG() INSTEAD
|
|
||||||
//
|
//
|
||||||
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
|
// with prefix = true, timestamps = true, the log output will look like this:
|
||||||
#define LOG_IMPL(str, ...) \
|
//
|
||||||
do { \
|
// 0.00.035.060 D ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34)
|
||||||
if (LOG_TARGET != nullptr) \
|
// 0.00.035.064 I llm_load_tensors: ggml ctx size = 0.27 MiB
|
||||||
{ \
|
// 0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU
|
||||||
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
|
// 0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU
|
||||||
fflush(LOG_TARGET); \
|
//
|
||||||
} \
|
// I - info (stdout, V = 0)
|
||||||
|
// W - warning (stderr, V = 0)
|
||||||
|
// E - error (stderr, V = 0)
|
||||||
|
// D - debug (stderr, V = LOG_DEFAULT_DEBUG)
|
||||||
|
//
|
||||||
|
|
||||||
|
void gpt_log_set_file (struct gpt_log * log, const char * file); // not thread-safe
|
||||||
|
void gpt_log_set_colors (struct gpt_log * log, bool colors); // not thread-safe
|
||||||
|
void gpt_log_set_prefix (struct gpt_log * log, bool prefix); // whether to output prefix to each log
|
||||||
|
void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // whether to output timestamps in the prefix
|
||||||
|
|
||||||
|
// helper macros for logging
|
||||||
|
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
|
||||||
|
//
|
||||||
|
// for example:
|
||||||
|
//
|
||||||
|
// LOG_DBG("this is a debug message: %d\n", expensive_function());
|
||||||
|
//
|
||||||
|
// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold
|
||||||
|
//
|
||||||
|
|
||||||
|
#define LOG_TMPL(level, verbosity, ...) \
|
||||||
|
do { \
|
||||||
|
if ((verbosity) <= gpt_log_verbosity_thold) { \
|
||||||
|
gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \
|
||||||
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
#else
|
|
||||||
#define LOG_IMPL(str, ...) \
|
|
||||||
do { \
|
|
||||||
if (LOG_TARGET != nullptr) \
|
|
||||||
{ \
|
|
||||||
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
|
|
||||||
fflush(LOG_TARGET); \
|
|
||||||
} \
|
|
||||||
} while (0)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// INTERNAL, DO NOT USE
|
#define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, 0, __VA_ARGS__)
|
||||||
// USE LOG_TEE() INSTEAD
|
#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__)
|
||||||
//
|
|
||||||
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
|
|
||||||
#define LOG_TEE_IMPL(str, ...) \
|
|
||||||
do { \
|
|
||||||
if (LOG_TARGET != nullptr) \
|
|
||||||
{ \
|
|
||||||
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
|
|
||||||
fflush(LOG_TARGET); \
|
|
||||||
} \
|
|
||||||
if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
|
|
||||||
{ \
|
|
||||||
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
|
|
||||||
fflush(LOG_TEE_TARGET); \
|
|
||||||
} \
|
|
||||||
} while (0)
|
|
||||||
#else
|
|
||||||
#define LOG_TEE_IMPL(str, ...) \
|
|
||||||
do { \
|
|
||||||
if (LOG_TARGET != nullptr) \
|
|
||||||
{ \
|
|
||||||
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
|
|
||||||
fflush(LOG_TARGET); \
|
|
||||||
} \
|
|
||||||
if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
|
|
||||||
{ \
|
|
||||||
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
|
|
||||||
fflush(LOG_TEE_TARGET); \
|
|
||||||
} \
|
|
||||||
} while (0)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// The '\0' as a last argument, is a trick to bypass the silly
|
#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, 0, __VA_ARGS__)
|
||||||
// "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro"
|
#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, 0, __VA_ARGS__)
|
||||||
// so we can have a single macro which can be called just like printf.
|
#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0, __VA_ARGS__)
|
||||||
|
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
|
||||||
|
|
||||||
// Main LOG macro.
|
#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__)
|
||||||
// behaves like printf, and supports arguments the exact same way.
|
#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__)
|
||||||
//
|
#define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
|
||||||
#if !defined(_MSC_VER) || defined(__clang__)
|
#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
|
||||||
#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
|
|
||||||
#else
|
|
||||||
#define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Main TEE macro.
|
|
||||||
// does the same as LOG
|
|
||||||
// and
|
|
||||||
// simultaneously writes stderr.
|
|
||||||
//
|
|
||||||
// Secondary target can be changed just like LOG_TARGET
|
|
||||||
// by defining LOG_TEE_TARGET
|
|
||||||
//
|
|
||||||
#if !defined(_MSC_VER) || defined(__clang__)
|
|
||||||
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
|
|
||||||
#else
|
|
||||||
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// LOG macro variants with auto endline.
|
|
||||||
#if !defined(_MSC_VER) || defined(__clang__)
|
|
||||||
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
|
|
||||||
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
|
|
||||||
#else
|
|
||||||
#define LOGLN(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
|
|
||||||
#define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n")
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// INTERNAL, DO NOT USE
|
|
||||||
inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
|
|
||||||
{
|
|
||||||
static bool _initialized = false;
|
|
||||||
static bool _append = false;
|
|
||||||
static bool _disabled = filename.empty() && target == nullptr;
|
|
||||||
static std::string log_current_filename{filename};
|
|
||||||
static FILE *log_current_target{target};
|
|
||||||
static FILE *logfile = nullptr;
|
|
||||||
|
|
||||||
if (change)
|
|
||||||
{
|
|
||||||
if (append != LogTriStateSame)
|
|
||||||
{
|
|
||||||
_append = append == LogTriStateTrue;
|
|
||||||
return logfile;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (disable == LogTriStateTrue)
|
|
||||||
{
|
|
||||||
// Disable primary target
|
|
||||||
_disabled = true;
|
|
||||||
}
|
|
||||||
// If previously disabled, only enable, and keep previous target
|
|
||||||
else if (disable == LogTriStateFalse)
|
|
||||||
{
|
|
||||||
_disabled = false;
|
|
||||||
}
|
|
||||||
// Otherwise, process the arguments
|
|
||||||
else if (log_current_filename != filename || log_current_target != target)
|
|
||||||
{
|
|
||||||
_initialized = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (_disabled)
|
|
||||||
{
|
|
||||||
// Log is disabled
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (_initialized)
|
|
||||||
{
|
|
||||||
// with fallback in case something went wrong
|
|
||||||
return logfile ? logfile : stderr;
|
|
||||||
}
|
|
||||||
|
|
||||||
// do the (re)initialization
|
|
||||||
if (target != nullptr)
|
|
||||||
{
|
|
||||||
if (logfile != nullptr && logfile != stdout && logfile != stderr)
|
|
||||||
{
|
|
||||||
fclose(logfile);
|
|
||||||
}
|
|
||||||
|
|
||||||
log_current_filename = LOG_DEFAULT_FILE_NAME;
|
|
||||||
log_current_target = target;
|
|
||||||
|
|
||||||
logfile = target;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (log_current_filename != filename)
|
|
||||||
{
|
|
||||||
if (logfile != nullptr && logfile != stdout && logfile != stderr)
|
|
||||||
{
|
|
||||||
fclose(logfile);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
logfile = fopen(filename.c_str(), _append ? "a" : "w");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!logfile)
|
|
||||||
{
|
|
||||||
// Verify whether the file was opened, otherwise fallback to stderr
|
|
||||||
logfile = stderr;
|
|
||||||
|
|
||||||
fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno));
|
|
||||||
fflush(stderr);
|
|
||||||
|
|
||||||
// At this point we let the init flag be to true below, and let the target fallback to stderr
|
|
||||||
// otherwise we would repeatedly fopen() which was already unsuccessful
|
|
||||||
}
|
|
||||||
|
|
||||||
_initialized = true;
|
|
||||||
|
|
||||||
return logfile ? logfile : stderr;
|
|
||||||
}
|
|
||||||
|
|
||||||
// INTERNAL, DO NOT USE
|
|
||||||
inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
|
|
||||||
{
|
|
||||||
return log_handler1_impl(change, append, disable, filename, target);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Disables logs entirely at runtime.
|
|
||||||
// Makes LOG() and LOG_TEE() produce no output,
|
|
||||||
// until enabled back.
|
|
||||||
#define log_disable() log_disable_impl()
|
|
||||||
|
|
||||||
// INTERNAL, DO NOT USE
|
|
||||||
inline FILE *log_disable_impl()
|
|
||||||
{
|
|
||||||
return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Enables logs at runtime.
|
|
||||||
#define log_enable() log_enable_impl()
|
|
||||||
|
|
||||||
// INTERNAL, DO NOT USE
|
|
||||||
inline FILE *log_enable_impl()
|
|
||||||
{
|
|
||||||
return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sets target for logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
|
|
||||||
#define log_set_target(target) log_set_target_impl(target)
|
|
||||||
|
|
||||||
// INTERNAL, DO NOT USE
|
|
||||||
inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); }
|
|
||||||
inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); }
|
|
||||||
|
|
||||||
// INTERNAL, DO NOT USE
|
|
||||||
// Default LOG_TARGET: queries the handler with all-default arguments (no change).
inline FILE *log_handler()
{
    return log_handler1_impl();
}
|
|
||||||
|
|
||||||
// Enable or disable creating separate log files for each run.
|
|
||||||
// can ONLY be invoked BEFORE first log use.
|
|
||||||
#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "")
|
|
||||||
// Enable or disable append mode for log file.
|
|
||||||
// can ONLY be invoked BEFORE first log use.
|
|
||||||
#define log_append(enable) log_append_impl(enable)
|
|
||||||
// INTERNAL, DO NOT USE
|
|
||||||
inline FILE *log_append_impl(bool enable)
|
|
||||||
{
|
|
||||||
return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void log_test()
|
|
||||||
{
|
|
||||||
log_disable();
|
|
||||||
LOG("01 Hello World to nobody, because logs are disabled!\n");
|
|
||||||
log_enable();
|
|
||||||
LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET));
|
|
||||||
LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n");
|
|
||||||
log_set_target(stderr);
|
|
||||||
LOG("04 Hello World to stderr!\n");
|
|
||||||
LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n");
|
|
||||||
log_set_target(LOG_DEFAULT_FILE_NAME);
|
|
||||||
LOG("06 Hello World to default log file!\n");
|
|
||||||
log_set_target(stdout);
|
|
||||||
LOG("07 Hello World to stdout!\n");
|
|
||||||
log_set_target(LOG_DEFAULT_FILE_NAME);
|
|
||||||
LOG("08 Hello World to default log file again!\n");
|
|
||||||
log_disable();
|
|
||||||
LOG("09 Hello World _1_ into the void!\n");
|
|
||||||
log_enable();
|
|
||||||
LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n");
|
|
||||||
log_disable();
|
|
||||||
log_set_target("llama.anotherlog.log");
|
|
||||||
LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n");
|
|
||||||
log_enable();
|
|
||||||
LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n");
|
|
||||||
log_set_target("llama.yetanotherlog.log");
|
|
||||||
LOG("13 Hello World this time in yet new file?\n");
|
|
||||||
log_set_target(log_filename_generator("llama_autonamed", "log"));
|
|
||||||
LOG("14 Hello World in log with generated filename!\n");
|
|
||||||
#ifdef _MSC_VER
|
|
||||||
LOG_TEE("15 Hello msvc TEE without arguments\n");
|
|
||||||
LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test");
|
|
||||||
LOG_TEELN("17 Hello msvc TEELN without arguments\n");
|
|
||||||
LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test");
|
|
||||||
LOG("19 Hello msvc LOG without arguments\n");
|
|
||||||
LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test");
|
|
||||||
LOGLN("21 Hello msvc LOGLN without arguments\n");
|
|
||||||
LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test");
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
inline bool log_param_single_parse(const std::string & param)
|
|
||||||
{
|
|
||||||
if ( param == "--log-test")
|
|
||||||
{
|
|
||||||
log_test();
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( param == "--log-disable")
|
|
||||||
{
|
|
||||||
log_disable();
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( param == "--log-enable")
|
|
||||||
{
|
|
||||||
log_enable();
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (param == "--log-new")
|
|
||||||
{
|
|
||||||
log_multilog(true);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (param == "--log-append")
|
|
||||||
{
|
|
||||||
log_append(true);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string())
|
|
||||||
{
|
|
||||||
if ( param == "--log-file")
|
|
||||||
{
|
|
||||||
if (!check_but_dont_parse)
|
|
||||||
{
|
|
||||||
log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log"));
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Prints the help text for the log command-line options to stdout.
inline void log_print_usage()
{
    /* format reference:
       printf(" -h, --help show this help message and exit\n");*/
    /* spacing reference:
       printf("__-param----------------Description\n");*/
    static const char * const usage_lines[] = {
        "log options:\n",
        " --log-test Run simple logging test\n",
        " --log-disable Disable trace logs\n",
        " --log-enable Enable trace logs\n",
        " --log-file Specify a log filename (without extension)\n",
        " --log-new Create a separate new log file on start. "
        "Each log file will have unique name: \"<name>.<ID>.log\"\n",
        " --log-append Don't truncate the old log file.\n",
        "\n",
    };

    for (const char * line : usage_lines)
    {
        printf("%s", line);
    }
}
|
|
||||||
|
|
||||||
#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
|
|
||||||
|
|
||||||
// INTERNAL, DO NOT USE
|
|
||||||
inline void log_dump_cmdline_impl(int argc, char **argv)
|
|
||||||
{
|
|
||||||
std::stringstream buf;
|
|
||||||
for (int i = 0; i < argc; ++i)
|
|
||||||
{
|
|
||||||
if (std::string(argv[i]).find(' ') != std::string::npos)
|
|
||||||
{
|
|
||||||
buf << " \"" << argv[i] <<"\"";
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
buf << " " << argv[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
LOGLN("Cmd:%s", buf.str().c_str());
|
|
||||||
}
|
|
||||||
|
|
||||||
#define log_tostr(var) log_var_to_string_impl(var).c_str()
|
|
||||||
|
|
||||||
// log_tostr() helper for booleans: yields "true" or "false".
inline std::string log_var_to_string_impl(bool var)
{
    if (var)
    {
        return "true";
    }
    return "false";
}
|
|
||||||
|
|
||||||
// log_tostr() helper for strings: returns the argument unchanged.
inline std::string log_var_to_string_impl(std::string var)
{
    // `var` is a by-value parameter, so it is simply handed back
    return var;
}
|
|
||||||
|
|
||||||
// log_tostr() helper for integer vectors: formats the elements as
// "[ a, b, c ]"; an empty vector yields "[  ]".
inline std::string log_var_to_string_impl(const std::vector<int> & var)
{
    std::stringstream out;
    out << "[ ";

    for (size_t idx = 0; idx < var.size(); ++idx)
    {
        // separator goes before every element except the first
        if (idx > 0)
        {
            out << ", ";
        }
        out << std::to_string(var[idx]);
    }

    out << " ]";

    return out.str();
}
|
|
||||||
|
|
||||||
// Formats a token sequence for logging as "[ 'piece':id, 'piece':id ]".
//   ctx    - context handed to llama_token_to_piece()
//   tokens - iterable range of token ids
// Characters of a piece for which std::isprint() is false are dropped.
template <typename C, typename T>
inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
{
    std::stringstream out;
    out << "[ ";

    // empty before the first element, ", " before every later one
    const char * separator = "";

    for (const auto & token : tokens)
    {
        out << separator;
        separator = ", ";

        auto piece = llama_token_to_piece(ctx, token);

        // strip non-printable characters (erase-remove)
        const auto printable_end = std::remove_if(
            piece.begin(),
            piece.end(),
            [](const unsigned char c) { return !std::isprint(c); });
        piece.erase(printable_end, piece.end());

        out << "'" << piece << "'";
        out << ":" << std::to_string(token);
    }

    out << " ]";

    return out.str();
}
|
|
||||||
|
|
||||||
// Formats a batch for logging: one "\n<i>:token '<piece>':pos <p>:n_seq_id <n>
// :seq_id <s>:logits <l>" record per token, records separated by ", " and the
// whole list wrapped in "[ " ... " ]".
//   ctx   - context handed to llama_token_to_piece()
//   batch - object exposing n_tokens, token, pos, n_seq_id, seq_id and logits
// Only the first sequence id (seq_id[i][0]) of each token is printed.
// Characters of a piece for which std::isprint() is false are dropped.
template <typename C, typename B>
inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch)
{
    std::stringstream out;
    out << "[ ";

    for (int i = 0; i < batch.n_tokens; ++i)
    {
        // separator goes before every record except the first
        if (i > 0)
        {
            out << ", ";
        }

        auto piece = llama_token_to_piece(ctx, batch.token[i]);

        // strip non-printable characters (erase-remove)
        const auto printable_end = std::remove_if(
            piece.begin(),
            piece.end(),
            [](const unsigned char c) { return !std::isprint(c); });
        piece.erase(printable_end, piece.end());

        out << "\n" << std::to_string(i);
        out << ":token '" << piece << "'";
        out << ":pos " << std::to_string(batch.pos[i]);
        out << ":n_seq_id " << std::to_string(batch.n_seq_id[i]);
        out << ":seq_id " << std::to_string(batch.seq_id[i][0]);
        out << ":logits " << std::to_string(batch.logits[i]);
    }

    out << " ]";

    return out.str();
}
|
|
||||||
|
|
||||||
// Compile-time kill switch for the logging macros.
//
// When LOG_DISABLE_LOGS is defined, each macro below is re-defined:
//   - LOG / LOGLN expand to nothing.
//   - LOG_TEE / LOG_TEELN fall back to a plain fprintf(stderr, ...).
//   - LOG_DISABLE / LOG_ENABLE / LOG_SET_TARGET / LOG_DUMP_CMDLINE expand to nothing.
// Every macro is #undef'ed first so the stub replaces any earlier definition
// without a redefinition warning. Each stub is defined exactly once.
#ifdef LOG_DISABLE_LOGS

#undef LOG
#define LOG(...) // dummy stub
#undef LOGLN
#define LOGLN(...) // dummy stub

#undef LOG_TEE
#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf

#undef LOG_TEELN
#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf

#undef LOG_DISABLE
#define LOG_DISABLE() // dummy stub

#undef LOG_ENABLE
#define LOG_ENABLE() // dummy stub

#undef LOG_SET_TARGET
#define LOG_SET_TARGET(...) // dummy stub

#undef LOG_DUMP_CMDLINE
#define LOG_DUMP_CMDLINE(...) // dummy stub

#endif // LOG_DISABLE_LOGS
|
|
||||||
|
@ -2,8 +2,11 @@
|
|||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
|
|
||||||
|
#include <cinttypes>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
|
#include <cstdio>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
|
#include <thread>
|
||||||
|
|
||||||
void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
|
void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
|
||||||
std::vector<llama_token> & inp, int nnew, bool print_progress) {
|
std::vector<llama_token> & inp, int nnew, bool print_progress) {
|
||||||
|
@ -325,7 +325,7 @@ llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
|
std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
|
||||||
std::string result = "\tlogits ";
|
std::string result = "logits ";
|
||||||
|
|
||||||
for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
|
for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
|
||||||
const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
|
const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
|
||||||
|
@ -1,9 +1,11 @@
|
|||||||
#include "train.h"
|
#include "train.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
#include <random>
|
#include <random>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
|
#include <cstring>
|
||||||
|
|
||||||
struct random_normal_distribution {
|
struct random_normal_distribution {
|
||||||
std::mt19937 gen;
|
std::mt19937 gen;
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
#include "arg.h"
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "log.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
@ -8,9 +9,9 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
static void print_usage(int, char ** argv) {
|
static void print_usage(int, char ** argv) {
|
||||||
LOG_TEE("\nexample usage:\n");
|
LOG("\nexample usage:\n");
|
||||||
LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
|
LOG("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
@ -20,6 +21,8 @@ int main(int argc, char ** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
gpt_init();
|
||||||
|
|
||||||
int is_pp_shared = params.is_pp_shared;
|
int is_pp_shared = params.is_pp_shared;
|
||||||
|
|
||||||
std::vector<int> n_pp = params.n_pp;
|
std::vector<int> n_pp = params.n_pp;
|
||||||
@ -76,7 +79,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
const int ret = llama_decode(ctx, batch_view);
|
const int ret = llama_decode(ctx, batch_view);
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
|
LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -93,17 +96,17 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!params.batched_bench_output_jsonl) {
|
if (!params.batched_bench_output_jsonl) {
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
|
LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
||||||
LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
|
LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
|
||||||
}
|
}
|
||||||
|
|
||||||
for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
|
for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
|
||||||
@ -133,7 +136,7 @@ int main(int argc, char ** argv) {
|
|||||||
llama_kv_cache_clear(ctx);
|
llama_kv_cache_clear(ctx);
|
||||||
|
|
||||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -155,7 +158,7 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -173,20 +176,20 @@ int main(int argc, char ** argv) {
|
|||||||
const float speed = n_kv / t;
|
const float speed = n_kv / t;
|
||||||
|
|
||||||
if(params.batched_bench_output_jsonl) {
|
if(params.batched_bench_output_jsonl) {
|
||||||
LOG_TEE(
|
LOG(
|
||||||
"{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
|
"{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
|
||||||
"\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
|
"\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
|
||||||
n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
|
n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
|
||||||
pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
|
pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
|
||||||
);
|
);
|
||||||
} else {
|
} else {
|
||||||
LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
|
LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
llama_perf_context_print(ctx);
|
llama_perf_context_print(ctx);
|
||||||
|
|
||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
@ -196,7 +199,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
fprintf(stderr, "\n\n");
|
LOG("\n\n");
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
#include "arg.h"
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "log.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
@ -8,9 +9,9 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
static void print_usage(int, char ** argv) {
|
static void print_usage(int, char ** argv) {
|
||||||
LOG_TEE("\nexample usage:\n");
|
LOG("\nexample usage:\n");
|
||||||
LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
|
LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
@ -23,6 +24,7 @@ int main(int argc, char ** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
gpt_init();
|
||||||
|
|
||||||
// number of parallel batches
|
// number of parallel batches
|
||||||
int n_parallel = params.n_parallel;
|
int n_parallel = params.n_parallel;
|
||||||
@ -42,7 +44,7 @@ int main(int argc, char ** argv) {
|
|||||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
||||||
|
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
LOG_ERR("%s: error: unable to load model\n" , __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -72,31 +74,29 @@ int main(int argc, char ** argv) {
|
|||||||
llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
|
llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
|
||||||
|
|
||||||
if (ctx == NULL) {
|
if (ctx == NULL) {
|
||||||
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
|
LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
|
LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
|
||||||
|
|
||||||
// make sure the KV cache is big enough to hold all the prompt and generated tokens
|
// make sure the KV cache is big enough to hold all the prompt and generated tokens
|
||||||
if (n_kv_req > n_ctx) {
|
if (n_kv_req > n_ctx) {
|
||||||
LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
|
LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
|
||||||
LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
|
LOG_ERR("%s: either reduce n_parallel or increase n_ctx\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// print the prompt token-by-token
|
// print the prompt token-by-token
|
||||||
|
|
||||||
fprintf(stderr, "\n");
|
LOG("\n");
|
||||||
|
|
||||||
for (auto id : tokens_list) {
|
for (auto id : tokens_list) {
|
||||||
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
|
LOG("%s", llama_token_to_piece(ctx, id).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
fflush(stderr);
|
|
||||||
|
|
||||||
// create a llama_batch
|
// create a llama_batch
|
||||||
// we use this object to submit token data for decoding
|
// we use this object to submit token data for decoding
|
||||||
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
|
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
|
||||||
@ -114,7 +114,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
if (llama_model_has_encoder(model)) {
|
if (llama_model_has_encoder(model)) {
|
||||||
if (llama_encode(ctx, batch)) {
|
if (llama_encode(ctx, batch)) {
|
||||||
LOG_TEE("%s : failed to eval\n", __func__);
|
LOG_ERR("%s : failed to eval\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -131,7 +131,7 @@ int main(int argc, char ** argv) {
|
|||||||
batch.logits[batch.n_tokens - 1] = true;
|
batch.logits[batch.n_tokens - 1] = true;
|
||||||
|
|
||||||
if (llama_decode(ctx, batch) != 0) {
|
if (llama_decode(ctx, batch) != 0) {
|
||||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -142,7 +142,7 @@ int main(int argc, char ** argv) {
|
|||||||
//}
|
//}
|
||||||
|
|
||||||
if (n_parallel > 1) {
|
if (n_parallel > 1) {
|
||||||
LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
|
LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
|
||||||
}
|
}
|
||||||
|
|
||||||
// main loop
|
// main loop
|
||||||
@ -175,9 +175,9 @@ int main(int argc, char ** argv) {
|
|||||||
// is it an end of generation? -> mark the stream as finished
|
// is it an end of generation? -> mark the stream as finished
|
||||||
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
|
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
|
||||||
i_batch[i] = -1;
|
i_batch[i] = -1;
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
if (n_parallel > 1) {
|
if (n_parallel > 1) {
|
||||||
LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
|
LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
|
||||||
}
|
}
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
@ -185,8 +185,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// if there is only one stream, we print immediately to stdout
|
// if there is only one stream, we print immediately to stdout
|
||||||
if (n_parallel == 1) {
|
if (n_parallel == 1) {
|
||||||
LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
|
LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
|
||||||
fflush(stdout);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
streams[i] += llama_token_to_piece(ctx, new_token_id);
|
streams[i] += llama_token_to_piece(ctx, new_token_id);
|
||||||
@ -208,27 +207,25 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// evaluate the current batch with the transformer model
|
// evaluate the current batch with the transformer model
|
||||||
if (llama_decode(ctx, batch)) {
|
if (llama_decode(ctx, batch)) {
|
||||||
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
|
LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("\n");
|
|
||||||
|
|
||||||
if (n_parallel > 1) {
|
if (n_parallel > 1) {
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
|
|
||||||
for (int32_t i = 0; i < n_parallel; ++i) {
|
for (int32_t i = 0; i < n_parallel; ++i) {
|
||||||
LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
|
LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto t_main_end = ggml_time_us();
|
const auto t_main_end = ggml_time_us();
|
||||||
|
|
||||||
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
|
LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
|
||||||
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
llama_perf_sampler_print(smpl);
|
llama_perf_sampler_print(smpl);
|
||||||
llama_perf_context_print(ctx);
|
llama_perf_context_print(ctx);
|
||||||
|
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
#include <climits>
|
#include <climits>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <cstdarg>
|
#include <cstdarg>
|
||||||
|
#include <cinttypes>
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
#include <random>
|
#include <random>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
@ -105,43 +106,43 @@ static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_
|
|||||||
const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
|
const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
|
||||||
try {
|
try {
|
||||||
w->token_embedding_table.resize(p->vocab_size * p->dim);
|
w->token_embedding_table.resize(p->vocab_size * p->dim);
|
||||||
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
||||||
|
|
||||||
w->rms_att_weight.resize(p->n_layers * p->dim);
|
w->rms_att_weight.resize(p->n_layers * p->dim);
|
||||||
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
|
LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
|
||||||
|
|
||||||
w->rms_ffn_weight.resize(p->n_layers * p->dim);
|
w->rms_ffn_weight.resize(p->n_layers * p->dim);
|
||||||
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
|
LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
|
||||||
|
|
||||||
w->wq.resize(p->n_layers * p->dim * p->dim);
|
w->wq.resize(p->n_layers * p->dim * p->dim);
|
||||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
||||||
|
|
||||||
w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
|
w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
|
||||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
|
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
|
||||||
|
|
||||||
w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
|
w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
|
||||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
|
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);
|
||||||
|
|
||||||
w->wo.resize(p->n_layers * p->dim * p->dim);
|
w->wo.resize(p->n_layers * p->dim * p->dim);
|
||||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
|
||||||
|
|
||||||
w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
|
w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
|
||||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
|
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
|
||||||
|
|
||||||
w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
|
w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
|
||||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
|
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
|
||||||
|
|
||||||
w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
|
w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
|
||||||
LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
|
LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
|
||||||
|
|
||||||
w->rms_final_weight.resize(p->dim);
|
w->rms_final_weight.resize(p->dim);
|
||||||
LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
|
LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
|
||||||
|
|
||||||
if (shared_weights) {
|
if (shared_weights) {
|
||||||
w->wcls = {};
|
w->wcls = {};
|
||||||
} else {
|
} else {
|
||||||
w->wcls.resize(p->vocab_size * p->dim);
|
w->wcls.resize(p->vocab_size * p->dim);
|
||||||
LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (std::length_error &) {
|
catch (std::length_error &) {
|
||||||
@ -173,7 +174,7 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
|
|||||||
fseek(f, 0, SEEK_END);
|
fseek(f, 0, SEEK_END);
|
||||||
auto end = ftell(f);
|
auto end = ftell(f);
|
||||||
if (curr != end) {
|
if (curr != end) {
|
||||||
LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
|
LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -181,20 +182,20 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void print_sample_weights(TransformerWeights *w){
|
static void print_sample_weights(TransformerWeights *w){
|
||||||
LOG("----- Quick print of first of the weight vales of all the variables\n");
|
LOG_INF("----- Quick print of first of the weight vales of all the variables\n");
|
||||||
LOG("%f\n", w->token_embedding_table[0]);
|
LOG_INF("%f\n", w->token_embedding_table[0]);
|
||||||
LOG("%f\n", w->rms_att_weight[0]);
|
LOG_INF("%f\n", w->rms_att_weight[0]);
|
||||||
LOG("%f\n", w->rms_ffn_weight[0]);
|
LOG_INF("%f\n", w->rms_ffn_weight[0]);
|
||||||
|
|
||||||
LOG("%f\n", w->wq[0]);
|
LOG_INF("%f\n", w->wq[0]);
|
||||||
LOG("%f\n", w->wk[0]);
|
LOG_INF("%f\n", w->wk[0]);
|
||||||
LOG("%f\n", w->wv[0]);
|
LOG_INF("%f\n", w->wv[0]);
|
||||||
LOG("%f\n", w->wo[0]);
|
LOG_INF("%f\n", w->wo[0]);
|
||||||
LOG("%f\n", w->w1[0]);
|
LOG_INF("%f\n", w->w1[0]);
|
||||||
LOG("%f\n", w->w2[0]);
|
LOG_INF("%f\n", w->w2[0]);
|
||||||
LOG("%f\n", w->w3[0]);
|
LOG_INF("%f\n", w->w3[0]);
|
||||||
LOG("%f\n", w->rms_att_weight[0]);
|
LOG_INF("%f\n", w->rms_att_weight[0]);
|
||||||
if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]);
|
if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]);
|
||||||
}
|
}
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
@ -318,20 +319,20 @@ struct train_params {
|
|||||||
};
|
};
|
||||||
|
|
||||||
static void print_params(struct my_llama_hparams * params) {
|
static void print_params(struct my_llama_hparams * params) {
|
||||||
LOG("%s: n_vocab: %u\n", __func__, params->n_vocab);
|
LOG_INF("%s: n_vocab: %u\n", __func__, params->n_vocab);
|
||||||
LOG("%s: n_ctx: %u\n", __func__, params->n_ctx);
|
LOG_INF("%s: n_ctx: %u\n", __func__, params->n_ctx);
|
||||||
LOG("%s: n_embd: %u\n", __func__, params->n_embd);
|
LOG_INF("%s: n_embd: %u\n", __func__, params->n_embd);
|
||||||
LOG("%s: n_mult: %u\n", __func__, params->n_mult);
|
LOG_INF("%s: n_mult: %u\n", __func__, params->n_mult);
|
||||||
LOG("%s: n_head: %u\n", __func__, params->n_head);
|
LOG_INF("%s: n_head: %u\n", __func__, params->n_head);
|
||||||
LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
|
LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
|
||||||
LOG("%s: n_ff: %u\n", __func__, params->n_ff);
|
LOG_INF("%s: n_ff: %u\n", __func__, params->n_ff);
|
||||||
LOG("%s: n_layer: %u\n", __func__, params->n_layer);
|
LOG_INF("%s: n_layer: %u\n", __func__, params->n_layer);
|
||||||
LOG("%s: n_rot: %u\n", __func__, params->n_rot);
|
LOG_INF("%s: n_rot: %u\n", __func__, params->n_rot);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void print_tensor_info(const struct ggml_context * ctx) {
|
static void print_tensor_info(const struct ggml_context * ctx) {
|
||||||
for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||||
LOG("%s: Allocating ", __func__);
|
LOG_INF("%s: Allocating ", __func__);
|
||||||
int64_t total = 1;
|
int64_t total = 1;
|
||||||
int i = 0;
|
int i = 0;
|
||||||
for (; i < ggml_n_dims(t); ++i) {
|
for (; i < ggml_n_dims(t); ++i) {
|
||||||
@ -526,7 +527,7 @@ static std::string llama_escape_whitespaces(const std::string & text) {
|
|||||||
|
|
||||||
static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
|
static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
|
||||||
if (is_ggml_file(filename)) {
|
if (is_ggml_file(filename)) {
|
||||||
LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
|
LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
|
||||||
struct ggml_context * ctx_data = NULL;
|
struct ggml_context * ctx_data = NULL;
|
||||||
|
|
||||||
struct gguf_init_params params = {
|
struct gguf_init_params params = {
|
||||||
@ -574,7 +575,7 @@ static void load_vocab(const char * filename, const Config * config, struct llam
|
|||||||
gguf_free(ctx);
|
gguf_free(ctx);
|
||||||
} else {
|
} else {
|
||||||
// assume llama2.c vocabulary
|
// assume llama2.c vocabulary
|
||||||
LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
|
LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
|
||||||
llama_file file(filename, "rb");
|
llama_file file(filename, "rb");
|
||||||
if (!file.fp) {
|
if (!file.fp) {
|
||||||
die_fmt("%s: %s", strerror(errno), filename);
|
die_fmt("%s: %s", strerror(errno), filename);
|
||||||
@ -871,23 +872,25 @@ static std::string basename(const std::string &path) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
|
gpt_init();
|
||||||
|
|
||||||
struct train_params params = get_default_train_params();
|
struct train_params params = get_default_train_params();
|
||||||
if (!params_parse(argc, argv, ¶ms)) {
|
if (!params_parse(argc, argv, ¶ms)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
log_set_target(stdout);
|
|
||||||
Config config;
|
Config config;
|
||||||
TransformerWeights weights = {};
|
TransformerWeights weights = {};
|
||||||
{
|
{
|
||||||
LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
|
LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
|
||||||
FILE * file = fopen(params.fn_llama2c_model, "rb");
|
FILE * file = fopen(params.fn_llama2c_model, "rb");
|
||||||
if (!file) {
|
if (!file) {
|
||||||
LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
|
LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
// read in the config header
|
// read in the config header
|
||||||
if (fread(&config, sizeof(Config), 1, file) != 1) {
|
if (fread(&config, sizeof(Config), 1, file) != 1) {
|
||||||
LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
|
LOG_ERR("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
auto shared_weights = config.vocab_size > 0;
|
auto shared_weights = config.vocab_size > 0;
|
||||||
@ -896,7 +899,7 @@ int main(int argc, char ** argv) {
|
|||||||
// read in the Transformer weights
|
// read in the Transformer weights
|
||||||
alloc_weights(&weights, &config, shared_weights);
|
alloc_weights(&weights, &config, shared_weights);
|
||||||
if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
|
if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
|
||||||
LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
|
LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
fclose(file);
|
fclose(file);
|
||||||
@ -929,7 +932,7 @@ int main(int argc, char ** argv) {
|
|||||||
model.name = basename(params.fn_llama2c_model);
|
model.name = basename(params.fn_llama2c_model);
|
||||||
save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
|
save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
|
||||||
|
|
||||||
LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
|
LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);
|
||||||
|
|
||||||
ggml_free(model.ctx);
|
ggml_free(model.ctx);
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -13,14 +13,15 @@
|
|||||||
#include "ggml-metal.h"
|
#include "ggml-metal.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <climits>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
#include <cstring>
|
||||||
|
#include <fstream>
|
||||||
|
#include <iostream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <tuple>
|
#include <tuple>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <algorithm>
|
|
||||||
#include <iostream>
|
|
||||||
#include <fstream>
|
|
||||||
#include <climits>
|
|
||||||
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
#include "arg.h"
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "log.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
@ -39,16 +40,16 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
|||||||
llama_kv_cache_clear(ctx);
|
llama_kv_cache_clear(ctx);
|
||||||
|
|
||||||
// run model
|
// run model
|
||||||
fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
||||||
if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
|
if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
|
||||||
// encoder-only model
|
// encoder-only model
|
||||||
if (llama_encode(ctx, batch) < 0) {
|
if (llama_encode(ctx, batch) < 0) {
|
||||||
fprintf(stderr, "%s : failed to encode\n", __func__);
|
LOG_ERR("%s : failed to encode\n", __func__);
|
||||||
}
|
}
|
||||||
} else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
|
} else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
|
||||||
// decoder-only model
|
// decoder-only model
|
||||||
if (llama_decode(ctx, batch) < 0) {
|
if (llama_decode(ctx, batch) < 0) {
|
||||||
fprintf(stderr, "%s : failed to decode\n", __func__);
|
LOG_ERR("%s : failed to decode\n", __func__);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -84,12 +85,12 @@ int main(int argc, char ** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
gpt_init();
|
||||||
|
|
||||||
params.embedding = true;
|
params.embedding = true;
|
||||||
// For non-causal models, batch size must be equal to ubatch size
|
// For non-causal models, batch size must be equal to ubatch size
|
||||||
params.n_ubatch = params.n_batch;
|
params.n_ubatch = params.n_batch;
|
||||||
|
|
||||||
print_build_info();
|
|
||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
|
|
||||||
@ -99,7 +100,7 @@ int main(int argc, char ** argv) {
|
|||||||
llama_model * model = llama_init.model;
|
llama_model * model = llama_init.model;
|
||||||
llama_context * ctx = llama_init.context;
|
llama_context * ctx = llama_init.context;
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
LOG_ERR("%s: unable to load model\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -109,19 +110,19 @@ int main(int argc, char ** argv) {
|
|||||||
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
|
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
|
||||||
|
|
||||||
if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
|
if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
|
||||||
fprintf(stderr, "%s: error: computing embeddings in encoder-decoder models is not supported\n", __func__);
|
LOG_ERR("%s: computing embeddings in encoder-decoder models is not supported\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (n_ctx > n_ctx_train) {
|
if (n_ctx > n_ctx_train) {
|
||||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||||
__func__, n_ctx_train, n_ctx);
|
__func__, n_ctx_train, n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
fprintf(stderr, "\n");
|
LOG_INF("\n");
|
||||||
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
// split the prompt into lines
|
// split the prompt into lines
|
||||||
@ -136,7 +137,7 @@ int main(int argc, char ** argv) {
|
|||||||
for (const auto & prompt : prompts) {
|
for (const auto & prompt : prompts) {
|
||||||
auto inp = ::llama_tokenize(ctx, prompt, true, false);
|
auto inp = ::llama_tokenize(ctx, prompt, true, false);
|
||||||
if (inp.size() > n_batch) {
|
if (inp.size() > n_batch) {
|
||||||
fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
|
LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
|
||||||
__func__, (long long int) inp.size(), (long long int) n_batch);
|
__func__, (long long int) inp.size(), (long long int) n_batch);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@ -147,20 +148,20 @@ int main(int argc, char ** argv) {
|
|||||||
// it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
|
// it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
|
||||||
for (auto & inp : inputs) {
|
for (auto & inp : inputs) {
|
||||||
if (inp.empty() || inp.back() != llama_token_sep(model)) {
|
if (inp.empty() || inp.back() != llama_token_sep(model)) {
|
||||||
fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
|
LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
|
||||||
fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
|
LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// tokenization stats
|
// tokenization stats
|
||||||
if (params.verbose_prompt) {
|
if (params.verbose_prompt) {
|
||||||
for (int i = 0; i < (int) inputs.size(); i++) {
|
for (int i = 0; i < (int) inputs.size(); i++) {
|
||||||
fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
|
LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
|
||||||
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
|
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
|
||||||
for (int j = 0; j < (int) inputs[i].size(); j++) {
|
for (int j = 0; j < (int) inputs[i].size(); j++) {
|
||||||
fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
|
LOG("%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
|
||||||
}
|
}
|
||||||
fprintf(stderr, "\n\n");
|
LOG("\n\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -211,57 +212,57 @@ int main(int argc, char ** argv) {
|
|||||||
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
|
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
|
||||||
|
|
||||||
if (params.embd_out.empty()) {
|
if (params.embd_out.empty()) {
|
||||||
fprintf(stdout, "\n");
|
LOG("\n");
|
||||||
|
|
||||||
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
||||||
for (int j = 0; j < n_embd_count; j++) {
|
for (int j = 0; j < n_embd_count; j++) {
|
||||||
fprintf(stdout, "embedding %d: ", j);
|
LOG("embedding %d: ", j);
|
||||||
for (int i = 0; i < std::min(3, n_embd); i++) {
|
for (int i = 0; i < std::min(3, n_embd); i++) {
|
||||||
if (params.embd_normalize == 0) {
|
if (params.embd_normalize == 0) {
|
||||||
fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
|
LOG("%6.0f ", emb[j * n_embd + i]);
|
||||||
} else {
|
} else {
|
||||||
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
|
LOG("%9.6f ", emb[j * n_embd + i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fprintf(stdout, " ... ");
|
LOG(" ... ");
|
||||||
for (int i = n_embd - 3; i < n_embd; i++) {
|
for (int i = n_embd - 3; i < n_embd; i++) {
|
||||||
if (params.embd_normalize == 0) {
|
if (params.embd_normalize == 0) {
|
||||||
fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
|
LOG("%6.0f ", emb[j * n_embd + i]);
|
||||||
} else {
|
} else {
|
||||||
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
|
LOG("%9.6f ", emb[j * n_embd + i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fprintf(stdout, "\n");
|
LOG("\n");
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// print the first part of the embeddings or for a single prompt, the full embedding
|
// print the first part of the embeddings or for a single prompt, the full embedding
|
||||||
for (int j = 0; j < n_prompts; j++) {
|
for (int j = 0; j < n_prompts; j++) {
|
||||||
fprintf(stdout, "embedding %d: ", j);
|
LOG("embedding %d: ", j);
|
||||||
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
|
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
|
||||||
if (params.embd_normalize == 0) {
|
if (params.embd_normalize == 0) {
|
||||||
fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
|
LOG("%6.0f ", emb[j * n_embd + i]);
|
||||||
} else {
|
} else {
|
||||||
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
|
LOG("%9.6f ", emb[j * n_embd + i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fprintf(stdout, "\n");
|
LOG("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
// print cosine similarity matrix
|
// print cosine similarity matrix
|
||||||
if (n_prompts > 1) {
|
if (n_prompts > 1) {
|
||||||
fprintf(stdout, "\n");
|
LOG("\n");
|
||||||
printf("cosine similarity matrix:\n\n");
|
LOG("cosine similarity matrix:\n\n");
|
||||||
for (int i = 0; i < n_prompts; i++) {
|
for (int i = 0; i < n_prompts; i++) {
|
||||||
fprintf(stdout, "%6.6s ", prompts[i].c_str());
|
LOG("%6.6s ", prompts[i].c_str());
|
||||||
}
|
}
|
||||||
fprintf(stdout, "\n");
|
LOG("\n");
|
||||||
for (int i = 0; i < n_prompts; i++) {
|
for (int i = 0; i < n_prompts; i++) {
|
||||||
for (int j = 0; j < n_prompts; j++) {
|
for (int j = 0; j < n_prompts; j++) {
|
||||||
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
|
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
|
||||||
fprintf(stdout, "%6.2f ", sim);
|
LOG("%6.2f ", sim);
|
||||||
}
|
}
|
||||||
fprintf(stdout, "%1.10s", prompts[i].c_str());
|
LOG("%1.10s", prompts[i].c_str());
|
||||||
fprintf(stdout, "\n");
|
LOG("\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -270,42 +271,42 @@ int main(int argc, char ** argv) {
|
|||||||
if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
|
if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
|
||||||
const bool notArray = params.embd_out != "array";
|
const bool notArray = params.embd_out != "array";
|
||||||
|
|
||||||
fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
|
LOG(notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
|
||||||
for (int j = 0;;) { // at least one iteration (one prompt)
|
for (int j = 0;;) { // at least one iteration (one prompt)
|
||||||
if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
|
if (notArray) LOG(" {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
|
||||||
fprintf(stdout, "[");
|
LOG("[");
|
||||||
for (int i = 0;;) { // at least one iteration (n_embd > 0)
|
for (int i = 0;;) { // at least one iteration (n_embd > 0)
|
||||||
fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
|
LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
|
||||||
i++;
|
i++;
|
||||||
if (i < n_embd) fprintf(stdout, ","); else break;
|
if (i < n_embd) LOG(","); else break;
|
||||||
}
|
}
|
||||||
fprintf(stdout, notArray ? "]\n }" : "]");
|
LOG(notArray ? "]\n }" : "]");
|
||||||
j++;
|
j++;
|
||||||
if (j < n_embd_count) fprintf(stdout, notArray ? ",\n" : ","); else break;
|
if (j < n_embd_count) LOG(notArray ? ",\n" : ","); else break;
|
||||||
}
|
}
|
||||||
fprintf(stdout, notArray ? "\n ]" : "]\n");
|
LOG(notArray ? "\n ]" : "]\n");
|
||||||
|
|
||||||
if (params.embd_out == "json+" && n_prompts > 1) {
|
if (params.embd_out == "json+" && n_prompts > 1) {
|
||||||
fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
|
LOG(",\n \"cosineSimilarity\": [\n");
|
||||||
for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
|
for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
|
||||||
fprintf(stdout, " [");
|
LOG(" [");
|
||||||
for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
|
for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
|
||||||
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
|
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
|
||||||
fprintf(stdout, "%6.2f", sim);
|
LOG("%6.2f", sim);
|
||||||
j++;
|
j++;
|
||||||
if (j < n_embd_count) fprintf(stdout, ", "); else break;
|
if (j < n_embd_count) LOG(", "); else break;
|
||||||
}
|
}
|
||||||
fprintf(stdout, " ]");
|
LOG(" ]");
|
||||||
i++;
|
i++;
|
||||||
if (i < n_embd_count) fprintf(stdout, ",\n"); else break;
|
if (i < n_embd_count) LOG(",\n"); else break;
|
||||||
}
|
}
|
||||||
fprintf(stdout, "\n ]");
|
LOG("\n ]");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (notArray) fprintf(stdout, "\n}\n");
|
if (notArray) LOG("\n}\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
llama_perf_context_print(ctx);
|
llama_perf_context_print(ctx);
|
||||||
|
|
||||||
// clean up
|
// clean up
|
||||||
|
@ -1,12 +1,11 @@
|
|||||||
#include "arg.h"
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "log.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <random>
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <tuple>
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -32,22 +31,22 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
|
|||||||
GGML_ASSERT(n > 0);
|
GGML_ASSERT(n > 0);
|
||||||
float sum = 0;
|
float sum = 0;
|
||||||
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
||||||
printf(" [\n");
|
LOG(" [\n");
|
||||||
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
|
||||||
if (i2 == n && ne[2] > 2*n) {
|
if (i2 == n && ne[2] > 2*n) {
|
||||||
printf(" ..., \n");
|
LOG(" ..., \n");
|
||||||
i2 = ne[2] - n;
|
i2 = ne[2] - n;
|
||||||
}
|
}
|
||||||
printf(" [\n");
|
LOG(" [\n");
|
||||||
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
|
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
|
||||||
if (i1 == n && ne[1] > 2*n) {
|
if (i1 == n && ne[1] > 2*n) {
|
||||||
printf(" ..., \n");
|
LOG(" ..., \n");
|
||||||
i1 = ne[1] - n;
|
i1 = ne[1] - n;
|
||||||
}
|
}
|
||||||
printf(" [");
|
LOG(" [");
|
||||||
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
|
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
|
||||||
if (i0 == n && ne[0] > 2*n) {
|
if (i0 == n && ne[0] > 2*n) {
|
||||||
printf("..., ");
|
LOG("..., ");
|
||||||
i0 = ne[0] - n;
|
i0 = ne[0] - n;
|
||||||
}
|
}
|
||||||
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
|
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
|
||||||
@ -65,16 +64,16 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
|
|||||||
} else {
|
} else {
|
||||||
GGML_ABORT("fatal error");
|
GGML_ABORT("fatal error");
|
||||||
}
|
}
|
||||||
printf("%12.4f", v);
|
LOG("%12.4f", v);
|
||||||
sum += v;
|
sum += v;
|
||||||
if (i0 < ne[0] - 1) printf(", ");
|
if (i0 < ne[0] - 1) LOG(", ");
|
||||||
}
|
}
|
||||||
printf("],\n");
|
LOG("],\n");
|
||||||
}
|
}
|
||||||
printf(" ],\n");
|
LOG(" ],\n");
|
||||||
}
|
}
|
||||||
printf(" ]\n");
|
LOG(" ]\n");
|
||||||
printf(" sum = %f\n", sum);
|
LOG(" sum = %f\n", sum);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -103,11 +102,11 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
|
|||||||
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
|
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
|
LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
|
||||||
t->name, ggml_type_name(t->type), ggml_op_desc(t),
|
t->name, ggml_type_name(t->type), ggml_op_desc(t),
|
||||||
src0->name, ggml_ne_string(src0).c_str(),
|
src0->name, ggml_ne_string(src0).c_str(),
|
||||||
src1 ? src1_str : "",
|
src1 ? src1_str : "",
|
||||||
ggml_ne_string(t).c_str());
|
ggml_ne_string(t).c_str());
|
||||||
|
|
||||||
|
|
||||||
// copy the data from the GPU memory if needed
|
// copy the data from the GPU memory if needed
|
||||||
@ -133,7 +132,7 @@ static bool run(llama_context * ctx, const gpt_params & params) {
|
|||||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||||
|
|
||||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
|
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
|
||||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
LOG_ERR("%s : failed to eval\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -149,7 +148,7 @@ int main(int argc, char ** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
print_build_info();
|
gpt_init();
|
||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
@ -166,14 +165,15 @@ int main(int argc, char ** argv) {
|
|||||||
llama_model * model = llama_init.model;
|
llama_model * model = llama_init.model;
|
||||||
llama_context * ctx = llama_init.context;
|
llama_context * ctx = llama_init.context;
|
||||||
if (model == nullptr || ctx == nullptr) {
|
if (model == nullptr || ctx == nullptr) {
|
||||||
fprintf(stderr, "%s : failed to init\n", __func__);
|
LOG_ERR("%s : failed to init\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
fprintf(stderr, "\n");
|
LOG_INF("\n");
|
||||||
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
|
LOG_INF("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
bool OK = run(ctx, params);
|
bool OK = run(ctx, params);
|
||||||
@ -181,7 +181,7 @@ int main(int argc, char ** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
llama_perf_context_print(ctx);
|
llama_perf_context_print(ctx);
|
||||||
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
|
@ -406,7 +406,7 @@ int main(int argc, char ** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
g_verbose = (params.verbosity == 1);
|
g_verbose = (params.verbosity > 1);
|
||||||
try {
|
try {
|
||||||
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
|
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
|
||||||
ctx.run_merge();
|
ctx.run_merge();
|
||||||
|
@ -158,6 +158,8 @@ int main(int argc, char * argv[]) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
gpt_init();
|
||||||
|
|
||||||
llama_model_params mparams = llama_model_params_from_gpt_params(params);
|
llama_model_params mparams = llama_model_params_from_gpt_params(params);
|
||||||
llama_context_params cparams = llama_context_params_from_gpt_params(params);
|
llama_context_params cparams = llama_context_params_from_gpt_params(params);
|
||||||
|
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
#include "arg.h"
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "log.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
@ -19,12 +20,12 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
static void print_usage(int, char ** argv) {
|
static void print_usage(int, char ** argv) {
|
||||||
LOG_TEE("\nexample usage:\n");
|
LOG("\nexample usage:\n");
|
||||||
LOG_TEE("\n %s \\\n"
|
LOG("\n %s \\\n"
|
||||||
" -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
|
" -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n"
|
||||||
" [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
|
" [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
|
||||||
" [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
|
" [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
struct Stats {
|
struct Stats {
|
||||||
@ -125,12 +126,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
|||||||
e.counts.resize(src1->ne[0]*n_as, 0);
|
e.counts.resize(src1->ne[0]*n_as, 0);
|
||||||
}
|
}
|
||||||
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
|
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
|
||||||
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
|
LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
|
||||||
exit(1); //GGML_ABORT("fatal error");
|
exit(1); //GGML_ABORT("fatal error");
|
||||||
}
|
}
|
||||||
if (m_params.verbosity > 1) {
|
LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
|
||||||
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
|
|
||||||
}
|
|
||||||
// loop over all possible experts, regardless if they are used or not in the batch
|
// loop over all possible experts, regardless if they are used or not in the batch
|
||||||
for (int ex = 0; ex < n_as; ++ex) {
|
for (int ex = 0; ex < n_as; ++ex) {
|
||||||
size_t e_start = ex*src1->ne[0];
|
size_t e_start = ex*src1->ne[0];
|
||||||
@ -151,7 +150,8 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
|||||||
e.values[e_start + j] += x[j]*x[j];
|
e.values[e_start + j] += x[j]*x[j];
|
||||||
e.counts[e_start + j]++;
|
e.counts[e_start + j]++;
|
||||||
if (!std::isfinite(e.values[e_start + j])) {
|
if (!std::isfinite(e.values[e_start + j])) {
|
||||||
fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
|
LOG("\n");
|
||||||
|
LOG_ERR("%f detected in %s\n", e.values[e_start + j], wname.c_str());
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -174,20 +174,18 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
|
|||||||
e.counts.resize(src1->ne[0], 0);
|
e.counts.resize(src1->ne[0], 0);
|
||||||
}
|
}
|
||||||
else if (e.values.size() != (size_t)src1->ne[0]) {
|
else if (e.values.size() != (size_t)src1->ne[0]) {
|
||||||
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
|
LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
|
||||||
exit(1); //GGML_ABORT("fatal error");
|
exit(1); //GGML_ABORT("fatal error");
|
||||||
}
|
}
|
||||||
++e.ncall;
|
++e.ncall;
|
||||||
if (m_params.verbosity > 1) {
|
LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
|
||||||
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
|
|
||||||
}
|
|
||||||
for (int row = 0; row < (int)src1->ne[1]; ++row) {
|
for (int row = 0; row < (int)src1->ne[1]; ++row) {
|
||||||
const float * x = data + row * src1->ne[0];
|
const float * x = data + row * src1->ne[0];
|
||||||
for (int j = 0; j < (int)src1->ne[0]; ++j) {
|
for (int j = 0; j < (int)src1->ne[0]; ++j) {
|
||||||
e.values[j] += x[j]*x[j];
|
e.values[j] += x[j]*x[j];
|
||||||
e.counts[j]++;
|
e.counts[j]++;
|
||||||
if (!std::isfinite(e.values[j])) {
|
if (!std::isfinite(e.values[j])) {
|
||||||
fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
|
LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str());
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -239,17 +237,17 @@ void IMatrixCollector::save_imatrix(int ncall) const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (n_zeros != 0 && is_first) {
|
if (n_zeros != 0 && is_first) {
|
||||||
fprintf(stderr, "\n");
|
LOG_INF("\n");
|
||||||
is_first = false;
|
is_first = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (n_zeros == n_all) {
|
if (n_zeros == n_all) {
|
||||||
fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
|
LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (n_zeros > 0) {
|
if (n_zeros > 0) {
|
||||||
fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
|
LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -258,7 +256,7 @@ void IMatrixCollector::save_imatrix(int ncall) const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (to_store.size() < m_stats.size()) {
|
if (to_store.size() < m_stats.size()) {
|
||||||
fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
|
LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
std::ofstream out(fname, std::ios::binary);
|
std::ofstream out(fname, std::ios::binary);
|
||||||
@ -290,21 +288,20 @@ void IMatrixCollector::save_imatrix(int ncall) const {
|
|||||||
out.write(m_params.prompt_file.c_str(), len);
|
out.write(m_params.prompt_file.c_str(), len);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (m_params.verbosity > 0) {
|
LOGV(1, "\n");
|
||||||
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
|
LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool IMatrixCollector::load_imatrix(const char * fname) {
|
bool IMatrixCollector::load_imatrix(const char * fname) {
|
||||||
std::ifstream in(fname, std::ios::binary);
|
std::ifstream in(fname, std::ios::binary);
|
||||||
if (!in) {
|
if (!in) {
|
||||||
printf("%s: failed to open %s\n",__func__, fname);
|
LOG_ERR("%s: failed to open %s\n",__func__, fname);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
int n_entries;
|
int n_entries;
|
||||||
in.read((char*)&n_entries, sizeof(n_entries));
|
in.read((char*)&n_entries, sizeof(n_entries));
|
||||||
if (in.fail() || n_entries < 1) {
|
if (in.fail() || n_entries < 1) {
|
||||||
printf("%s: no data in file %s\n", __func__, fname);
|
LOG_ERR("%s: no data in file %s\n", __func__, fname);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
for (int i = 0; i < n_entries; ++i) {
|
for (int i = 0; i < n_entries; ++i) {
|
||||||
@ -312,7 +309,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
|
|||||||
std::vector<char> name_as_vec(len+1);
|
std::vector<char> name_as_vec(len+1);
|
||||||
in.read((char *)name_as_vec.data(), len);
|
in.read((char *)name_as_vec.data(), len);
|
||||||
if (in.fail()) {
|
if (in.fail()) {
|
||||||
printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
|
LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
name_as_vec[len] = 0;
|
name_as_vec[len] = 0;
|
||||||
@ -323,7 +320,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
|
|||||||
int nval;
|
int nval;
|
||||||
in.read((char *)&nval, sizeof(nval));
|
in.read((char *)&nval, sizeof(nval));
|
||||||
if (in.fail() || nval < 1) {
|
if (in.fail() || nval < 1) {
|
||||||
printf("%s: failed reading number of values for entry %d\n",__func__,i);
|
LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i);
|
||||||
m_stats = {};
|
m_stats = {};
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -336,7 +333,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
|
|||||||
std::vector<float> tmp(nval);
|
std::vector<float> tmp(nval);
|
||||||
in.read((char*)tmp.data(), nval*sizeof(float));
|
in.read((char*)tmp.data(), nval*sizeof(float));
|
||||||
if (in.fail()) {
|
if (in.fail()) {
|
||||||
printf("%s: failed reading data for entry %d\n",__func__,i);
|
LOG_ERR("%s: failed reading data for entry %d\n",__func__,i);
|
||||||
m_stats = {};
|
m_stats = {};
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -437,26 +434,25 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
|||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
auto tim1 = std::chrono::high_resolution_clock::now();
|
auto tim1 = std::chrono::high_resolution_clock::now();
|
||||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
LOG_INF("%s: tokenizing the input ..\n", __func__);
|
||||||
|
|
||||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||||
|
|
||||||
auto tim2 = std::chrono::high_resolution_clock::now();
|
auto tim2 = std::chrono::high_resolution_clock::now();
|
||||||
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
||||||
|
|
||||||
if (params.i_chunk > 0) {
|
if (params.i_chunk > 0) {
|
||||||
if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
|
if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
|
||||||
fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
|
LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
|
LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
|
||||||
tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
|
tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (int(tokens.size()) < 2*n_ctx) {
|
if (int(tokens.size()) < 2*n_ctx) {
|
||||||
fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
|
LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx);
|
||||||
n_ctx);
|
LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size());
|
||||||
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -478,7 +474,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
|||||||
double nll = 0.0;
|
double nll = 0.0;
|
||||||
double nll2 = 0.0;
|
double nll2 = 0.0;
|
||||||
|
|
||||||
fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
|
LOG_INF("%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
|
||||||
|
|
||||||
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
|
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
|
||||||
|
|
||||||
@ -514,7 +510,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
|||||||
|
|
||||||
// TODO: use batch.logits to save computations instead of relying on logits_all == true
|
// TODO: use batch.logits to save computations instead of relying on logits_all == true
|
||||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
|
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
|
||||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
LOG_ERR("%s : failed to eval\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -531,29 +527,29 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
|||||||
|
|
||||||
if (i == 0) {
|
if (i == 0) {
|
||||||
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
|
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
|
||||||
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
|
LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
|
||||||
int total_seconds = (int)(t_total * n_chunk);
|
int total_seconds = (int)(t_total * n_chunk);
|
||||||
if (total_seconds >= 60*60) {
|
if (total_seconds >= 60*60) {
|
||||||
fprintf(stderr, "%d hours ", total_seconds / (60*60));
|
LOG("%d hours ", total_seconds / (60*60));
|
||||||
total_seconds = total_seconds % (60*60);
|
total_seconds = total_seconds % (60*60);
|
||||||
}
|
}
|
||||||
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
|
LOG("%.2f minutes\n", total_seconds / 60.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.compute_ppl) {
|
if (params.compute_ppl) {
|
||||||
const int first = n_ctx/2;
|
const int first = n_ctx/2;
|
||||||
const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
|
const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
|
||||||
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
|
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
|
||||||
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
|
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
|
||||||
count += n_ctx - first - 1;
|
count += n_ctx - first - 1;
|
||||||
|
|
||||||
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
|
LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
|
|
||||||
logits.clear();
|
logits.clear();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
|
|
||||||
if (params.compute_ppl) {
|
if (params.compute_ppl) {
|
||||||
nll2 /= count;
|
nll2 /= count;
|
||||||
@ -562,9 +558,9 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
|||||||
nll2 -= nll * nll;
|
nll2 -= nll * nll;
|
||||||
if (nll2 > 0) {
|
if (nll2 > 0) {
|
||||||
nll2 = sqrt(nll2/(count-1));
|
nll2 = sqrt(nll2/(count-1));
|
||||||
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
|
LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
|
||||||
} else {
|
} else {
|
||||||
printf("Unexpected negative standard deviation of log(prob)\n");
|
LOG("Unexpected negative standard deviation of log(prob)\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -576,26 +572,27 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
params.n_ctx = 512;
|
params.n_ctx = 512;
|
||||||
params.logits_all = true;
|
params.logits_all = true;
|
||||||
params.verbosity = 1;
|
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
gpt_init();
|
||||||
|
|
||||||
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
||||||
|
|
||||||
g_collector.set_params(params);
|
g_collector.set_params(params);
|
||||||
|
|
||||||
for (const auto & in_file : params.in_files) {
|
for (const auto & in_file : params.in_files) {
|
||||||
printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
|
LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
|
||||||
if (!g_collector.load_imatrix(in_file.c_str())) {
|
if (!g_collector.load_imatrix(in_file.c_str())) {
|
||||||
fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
|
LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str());
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.in_files.size() > 1) {
|
if (params.in_files.size() > 1) {
|
||||||
printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
|
LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
|
||||||
g_collector.save_imatrix();
|
g_collector.save_imatrix();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -614,20 +611,20 @@ int main(int argc, char ** argv) {
|
|||||||
llama_model * model = llama_init.model;
|
llama_model * model = llama_init.model;
|
||||||
llama_context * ctx = llama_init.context;
|
llama_context * ctx = llama_init.context;
|
||||||
if (model == nullptr || ctx == nullptr) {
|
if (model == nullptr || ctx == nullptr) {
|
||||||
fprintf(stderr, "%s : failed to init\n", __func__);
|
LOG_ERR("%s : failed to init\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int n_ctx_train = llama_n_ctx_train(model);
|
const int n_ctx_train = llama_n_ctx_train(model);
|
||||||
if (params.n_ctx > n_ctx_train) {
|
if (params.n_ctx > n_ctx_train) {
|
||||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
|
||||||
__func__, n_ctx_train, params.n_ctx);
|
__func__, n_ctx_train, params.n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
fprintf(stderr, "\n");
|
LOG_INF("\n");
|
||||||
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!compute_imatrix(ctx, params)) {
|
if (!compute_imatrix(ctx, params)) {
|
||||||
@ -636,7 +633,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
g_collector.save_imatrix();
|
g_collector.save_imatrix();
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
llama_perf_context_print(ctx);
|
llama_perf_context_print(ctx);
|
||||||
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
|
@ -2,6 +2,7 @@
|
|||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "console.h"
|
#include "console.h"
|
||||||
#include "sampling.h"
|
#include "sampling.h"
|
||||||
|
#include "log.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
@ -55,7 +56,7 @@ static void write_logfile(
|
|||||||
|
|
||||||
const bool success = fs_create_directory_with_parents(params.logdir);
|
const bool success = fs_create_directory_with_parents(params.logdir);
|
||||||
if (!success) {
|
if (!success) {
|
||||||
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
LOG_ERR("%s: warning: failed to create logdir %s, cannot write logfile\n",
|
||||||
__func__, params.logdir.c_str());
|
__func__, params.logdir.c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -64,7 +65,7 @@ static void write_logfile(
|
|||||||
FILE * logfile = fopen(logfile_path.c_str(), "w");
|
FILE * logfile = fopen(logfile_path.c_str(), "w");
|
||||||
|
|
||||||
if (logfile == NULL) {
|
if (logfile == NULL) {
|
||||||
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -93,7 +94,7 @@ static void sigint_handler(int signo) {
|
|||||||
is_interacting = true;
|
is_interacting = true;
|
||||||
} else {
|
} else {
|
||||||
console::cleanup();
|
console::cleanup();
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
gpt_perf_print(*g_ctx, *g_smpl);
|
gpt_perf_print(*g_ctx, *g_smpl);
|
||||||
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
|
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
|
||||||
_exit(130);
|
_exit(130);
|
||||||
@ -110,56 +111,51 @@ int main(int argc, char ** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto & sparams = params.sparams;
|
gpt_init();
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
auto & sparams = params.sparams;
|
||||||
log_set_target(log_filename_generator("infill", "log"));
|
|
||||||
LOG_TEE("Log start\n");
|
|
||||||
log_dump_cmdline(argc, argv);
|
|
||||||
#endif // LOG_DISABLE_LOGS
|
|
||||||
|
|
||||||
console::init(params.simple_io, params.use_color);
|
console::init(params.simple_io, params.use_color);
|
||||||
atexit([]() { console::cleanup(); });
|
atexit([]() { console::cleanup(); });
|
||||||
|
|
||||||
if (params.logits_all) {
|
if (params.logits_all) {
|
||||||
printf("\n************\n");
|
LOG_ERR("\n************\n");
|
||||||
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
|
LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
|
||||||
printf("************\n\n");
|
LOG_ERR("************\n\n");
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.embedding) {
|
if (params.embedding) {
|
||||||
printf("\n************\n");
|
LOG_ERR("\n************\n");
|
||||||
printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
|
LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
|
||||||
printf("************\n\n");
|
LOG_ERR("************\n\n");
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.n_ctx != 0 && params.n_ctx < 8) {
|
if (params.n_ctx != 0 && params.n_ctx < 8) {
|
||||||
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
|
||||||
params.n_ctx = 8;
|
params.n_ctx = 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
|
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
|
||||||
printf("\n************\n");
|
LOG_ERR("\n************\n");
|
||||||
printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
|
LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
|
||||||
printf("************\n\n");
|
LOG_ERR("************\n\n");
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.rope_freq_base != 0.0) {
|
if (params.rope_freq_base != 0.0) {
|
||||||
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
|
LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.rope_freq_scale != 0.0) {
|
if (params.rope_freq_scale != 0.0) {
|
||||||
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
||||||
}
|
}
|
||||||
|
|
||||||
print_build_info();
|
LOG_INF("%s: llama backend init\n", __func__);
|
||||||
|
|
||||||
LOG("%s: llama backend init\n", __func__);
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
|
|
||||||
@ -172,34 +168,32 @@ int main(int argc, char ** argv) {
|
|||||||
g_smpl = &smpl;
|
g_smpl = &smpl;
|
||||||
|
|
||||||
// load the model and apply lora adapter, if any
|
// load the model and apply lora adapter, if any
|
||||||
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||||
llama_init_result llama_init = llama_init_from_gpt_params(params);
|
llama_init_result llama_init = llama_init_from_gpt_params(params);
|
||||||
|
|
||||||
model = llama_init.model;
|
model = llama_init.model;
|
||||||
ctx = llama_init.context;
|
ctx = llama_init.context;
|
||||||
|
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
LOG_TEE("%s: error: unable to load model\n", __func__);
|
LOG_ERR("%s: unable to load model\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int n_ctx_train = llama_n_ctx_train(model);
|
const int n_ctx_train = llama_n_ctx_train(model);
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
LOG("n_ctx: %d\n", n_ctx);
|
LOG_DBG("n_ctx: %d\n", n_ctx);
|
||||||
|
|
||||||
if (n_ctx > n_ctx_train) {
|
if (n_ctx > n_ctx_train) {
|
||||||
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
|
||||||
__func__, n_ctx_train, n_ctx);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
|
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
const bool add_bos = llama_add_bos_token(model);
|
const bool add_bos = llama_add_bos_token(model);
|
||||||
GGML_ASSERT(!llama_add_eos_token(model));
|
GGML_ASSERT(!llama_add_eos_token(model));
|
||||||
LOG("add_bos: %d\n", add_bos);
|
|
||||||
|
|
||||||
std::vector<llama_token> embd_inp;
|
std::vector<llama_token> embd_inp;
|
||||||
std::vector<llama_token> embd_end;
|
std::vector<llama_token> embd_end;
|
||||||
@ -224,18 +218,19 @@ int main(int argc, char ** argv) {
|
|||||||
embd_inp.push_back(middle_token);
|
embd_inp.push_back(middle_token);
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
|
LOG_DBG("add_bos: %d\n", add_bos);
|
||||||
LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
|
LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
|
||||||
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
|
||||||
|
LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
|
||||||
|
|
||||||
// Should not run without any tokens
|
// Should not run without any tokens
|
||||||
if (embd_inp.empty()) {
|
if (embd_inp.empty()) {
|
||||||
embd_inp.push_back(llama_token_bos(model));
|
embd_inp.push_back(llama_token_bos(model));
|
||||||
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((int) embd_inp.size() > n_ctx - 4) {
|
if ((int) embd_inp.size() > n_ctx - 4) {
|
||||||
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -244,9 +239,8 @@ int main(int argc, char ** argv) {
|
|||||||
params.n_keep = (int)embd_inp.size();
|
params.n_keep = (int)embd_inp.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
|
LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
|
||||||
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
|
LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());
|
||||||
|
|
||||||
|
|
||||||
// enable interactive mode if interactive start is specified
|
// enable interactive mode if interactive start is specified
|
||||||
if (params.interactive_first) {
|
if (params.interactive_first) {
|
||||||
@ -254,21 +248,21 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (params.verbose_prompt) {
|
if (params.verbose_prompt) {
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||||
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||||
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.n_keep > 0) {
|
if (params.n_keep > 0) {
|
||||||
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
|
LOG_INF("%s: static prompt based on n_keep: '", __func__);
|
||||||
for (int i = 0; i < params.n_keep; i++) {
|
for (int i = 0; i < params.n_keep; i++) {
|
||||||
LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||||
}
|
}
|
||||||
LOG_TEE("'\n");
|
LOG("'\n");
|
||||||
}
|
}
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
@ -285,28 +279,30 @@ int main(int argc, char ** argv) {
|
|||||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
LOG_TEE("%s: interactive mode on.\n", __func__);
|
LOG_INF("%s: interactive mode on.\n", __func__);
|
||||||
|
|
||||||
if (params.input_prefix_bos) {
|
if (params.input_prefix_bos) {
|
||||||
LOG_TEE("Input prefix with BOS\n");
|
LOG_INF("Input prefix with BOS\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!params.input_prefix.empty()) {
|
if (!params.input_prefix.empty()) {
|
||||||
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
|
LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!params.input_suffix.empty()) {
|
if (!params.input_suffix.empty()) {
|
||||||
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
|
LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
smpl = gpt_sampler_init(model, sparams);
|
smpl = gpt_sampler_init(model, sparams);
|
||||||
|
|
||||||
LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
|
LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
|
||||||
LOG_TEE("sampling: \n%s\n", sparams.print().c_str());
|
LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
|
||||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
|
||||||
LOG_TEE("\n\n");
|
|
||||||
|
|
||||||
LOG_TEE("\n##### Infill mode #####\n\n");
|
LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||||
|
|
||||||
|
LOG("\n");
|
||||||
|
LOG("\n##### Infill mode #####\n\n");
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
const char *control_message;
|
const char *control_message;
|
||||||
if (params.multiline_input) {
|
if (params.multiline_input) {
|
||||||
@ -317,11 +313,11 @@ int main(int argc, char ** argv) {
|
|||||||
" - To return control without starting a new line, end your input with '/'.\n"
|
" - To return control without starting a new line, end your input with '/'.\n"
|
||||||
" - If you want to submit another line, end your input with '\\'.\n";
|
" - If you want to submit another line, end your input with '\\'.\n";
|
||||||
}
|
}
|
||||||
LOG_TEE("== Running in interactive mode. ==\n");
|
LOG("== Running in interactive mode. ==\n");
|
||||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||||
LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
|
LOG( " - Press Ctrl+C to interject at any time.\n");
|
||||||
#endif
|
#endif
|
||||||
LOG_TEE( "%s\n", control_message);
|
LOG( "%s\n", control_message);
|
||||||
|
|
||||||
is_interacting = params.interactive_first;
|
is_interacting = params.interactive_first;
|
||||||
}
|
}
|
||||||
@ -354,9 +350,8 @@ int main(int argc, char ** argv) {
|
|||||||
embd.resize(max_embd_size);
|
embd.resize(max_embd_size);
|
||||||
|
|
||||||
console::set_display(console::error);
|
console::set_display(console::error);
|
||||||
printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
||||||
console::set_display(console::reset);
|
console::set_display(console::reset);
|
||||||
fflush(stdout);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// infinite text generation via context swapping
|
// infinite text generation via context swapping
|
||||||
@ -365,14 +360,14 @@ int main(int argc, char ** argv) {
|
|||||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
||||||
if (n_past + (int) embd.size() > n_ctx) {
|
if (n_past + (int) embd.size() > n_ctx) {
|
||||||
if (params.n_predict == -2) {
|
if (params.n_predict == -2) {
|
||||||
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int n_left = n_past - params.n_keep - 1;
|
const int n_left = n_past - params.n_keep - 1;
|
||||||
const int n_discard = n_left/2;
|
const int n_discard = n_left/2;
|
||||||
|
|
||||||
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||||
|
|
||||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||||
@ -380,9 +375,9 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
n_past -= n_discard;
|
n_past -= n_discard;
|
||||||
|
|
||||||
LOG("after swap: n_past = %d\n", n_past);
|
LOG_DBG("after swap: n_past = %d\n", n_past);
|
||||||
|
|
||||||
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -394,16 +389,16 @@ int main(int argc, char ** argv) {
|
|||||||
n_eval = params.n_batch;
|
n_eval = params.n_batch;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
|
||||||
|
|
||||||
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
|
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
|
||||||
LOG_TEE("%s : failed to eval\n", __func__);
|
LOG_ERR("%s : failed to eval\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
n_past += n_eval;
|
n_past += n_eval;
|
||||||
|
|
||||||
LOG("n_past = %d\n", n_past);
|
LOG_DBG("n_past = %d\n", n_past);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -415,7 +410,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
gpt_sampler_accept(smpl, id, true);
|
gpt_sampler_accept(smpl, id, true);
|
||||||
|
|
||||||
// LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str());
|
// LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
|
||||||
|
|
||||||
embd.push_back(id);
|
embd.push_back(id);
|
||||||
|
|
||||||
@ -425,10 +420,10 @@ int main(int argc, char ** argv) {
|
|||||||
// decrement remaining sampling budget
|
// decrement remaining sampling budget
|
||||||
--n_remain;
|
--n_remain;
|
||||||
|
|
||||||
LOG("n_remain: %d\n", n_remain);
|
LOG_DBG("n_remain: %d\n", n_remain);
|
||||||
} else {
|
} else {
|
||||||
// some user input remains from prompt or interaction, forward it to processing
|
// some user input remains from prompt or interaction, forward it to processing
|
||||||
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
||||||
while ((int) embd_inp.size() > n_consumed) {
|
while ((int) embd_inp.size() > n_consumed) {
|
||||||
embd.push_back(embd_inp[n_consumed]);
|
embd.push_back(embd_inp[n_consumed]);
|
||||||
|
|
||||||
@ -447,7 +442,7 @@ int main(int argc, char ** argv) {
|
|||||||
if (input_echo) {
|
if (input_echo) {
|
||||||
for (auto id : embd) {
|
for (auto id : embd) {
|
||||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||||
printf("%s", token_str.c_str());
|
LOG("%s", token_str.c_str());
|
||||||
|
|
||||||
if (embd.size() > 1) {
|
if (embd.size() > 1) {
|
||||||
input_tokens.push_back(id);
|
input_tokens.push_back(id);
|
||||||
@ -456,7 +451,6 @@ int main(int argc, char ** argv) {
|
|||||||
output_ss << token_str;
|
output_ss << token_str;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fflush(stdout);
|
|
||||||
}
|
}
|
||||||
// reset color to default if we there is no pending user input
|
// reset color to default if we there is no pending user input
|
||||||
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
||||||
@ -469,10 +463,9 @@ int main(int argc, char ** argv) {
|
|||||||
if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
|
if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
|
||||||
if (is_interacting && !params.interactive_first) {
|
if (is_interacting && !params.interactive_first) {
|
||||||
// print an eot token
|
// print an eot token
|
||||||
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
||||||
}
|
}
|
||||||
fflush(stdout);
|
LOG("\n");
|
||||||
printf("\n");
|
|
||||||
console::set_display(console::user_input);
|
console::set_display(console::user_input);
|
||||||
std::string buffer;
|
std::string buffer;
|
||||||
std::string line;
|
std::string line;
|
||||||
@ -528,35 +521,33 @@ int main(int argc, char ** argv) {
|
|||||||
n_remain = params.n_predict;
|
n_remain = params.n_predict;
|
||||||
n_past = 0;
|
n_past = 0;
|
||||||
n_consumed = 0;
|
n_consumed = 0;
|
||||||
// LOG_TEE("took new input\n");
|
|
||||||
is_interacting = false;
|
is_interacting = false;
|
||||||
}
|
}
|
||||||
// deal with end of generation tokens in interactive mode
|
// deal with end of generation tokens in interactive mode
|
||||||
else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
|
else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
|
||||||
LOG("found EOS token\n");
|
LOG_DBG("found EOS token\n");
|
||||||
|
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
|
|
||||||
is_interacting = true;
|
is_interacting = true;
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
console::set_display(console::user_input);
|
console::set_display(console::user_input);
|
||||||
fflush(stdout);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (n_past > 0 && is_interacting && !params.interactive) {
|
if (n_past > 0 && is_interacting && !params.interactive) {
|
||||||
LOG("waiting for user input\n");
|
LOG_DBG("waiting for user input\n");
|
||||||
|
|
||||||
if (params.input_prefix_bos) {
|
if (params.input_prefix_bos) {
|
||||||
LOG("adding input prefix BOS token\n");
|
LOG_DBG("adding input prefix BOS token\n");
|
||||||
embd_inp.push_back(llama_token_bos(model));
|
embd_inp.push_back(llama_token_bos(model));
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string buffer;
|
std::string buffer;
|
||||||
if (!params.input_prefix.empty()) {
|
if (!params.input_prefix.empty()) {
|
||||||
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
||||||
buffer += params.input_prefix;
|
buffer += params.input_prefix;
|
||||||
printf("%s", buffer.c_str());
|
LOG("%s", buffer.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string line;
|
std::string line;
|
||||||
@ -574,17 +565,17 @@ int main(int argc, char ** argv) {
|
|||||||
if (buffer.length() > 1) {
|
if (buffer.length() > 1) {
|
||||||
// append input suffix if any
|
// append input suffix if any
|
||||||
if (!params.input_suffix.empty()) {
|
if (!params.input_suffix.empty()) {
|
||||||
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
||||||
buffer += params.input_suffix;
|
buffer += params.input_suffix;
|
||||||
printf("%s", params.input_suffix.c_str());
|
LOG("%s", params.input_suffix.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("buffer: '%s'\n", buffer.c_str());
|
LOG_DBG("buffer: '%s'\n", buffer.c_str());
|
||||||
|
|
||||||
const size_t original_size = embd_inp.size();
|
const size_t original_size = embd_inp.size();
|
||||||
|
|
||||||
const auto line_inp = ::llama_tokenize(ctx, buffer, false);
|
const auto line_inp = ::llama_tokenize(ctx, buffer, false);
|
||||||
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
|
LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
|
||||||
|
|
||||||
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
||||||
|
|
||||||
@ -595,9 +586,9 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
n_remain -= line_inp.size();
|
n_remain -= line_inp.size();
|
||||||
LOG("n_remain: %d\n", n_remain);
|
LOG_DBG("n_remain: %d\n", n_remain);
|
||||||
} else {
|
} else {
|
||||||
LOG("empty line, passing control back\n");
|
LOG_DBG("empty line, passing control back\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
input_echo = false; // do not echo this again
|
input_echo = false; // do not echo this again
|
||||||
@ -624,11 +615,10 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!params.interactive && n_remain <= 0) {
|
if (!params.interactive && n_remain <= 0) {
|
||||||
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
||||||
fflush(stdout);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
gpt_perf_print(ctx, smpl);
|
gpt_perf_print(ctx, smpl);
|
||||||
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
|
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
|
||||||
|
|
||||||
@ -638,9 +628,5 @@ int main(int argc, char ** argv) {
|
|||||||
gpt_sampler_free(smpl);
|
gpt_sampler_free(smpl);
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
|
||||||
LOG_TEE("Log end\n");
|
|
||||||
#endif // LOG_DISABLE_LOGS
|
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -3,7 +3,6 @@
|
|||||||
// I'll gradually clean and extend it
|
// I'll gradually clean and extend it
|
||||||
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
||||||
#include "clip.h"
|
#include "clip.h"
|
||||||
#include "log.h"
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "ggml-alloc.h"
|
#include "ggml-alloc.h"
|
||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
@ -40,6 +39,11 @@
|
|||||||
#include <cinttypes>
|
#include <cinttypes>
|
||||||
#include <limits>
|
#include <limits>
|
||||||
|
|
||||||
|
#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
|
||||||
|
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
||||||
|
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
||||||
|
#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
||||||
|
|
||||||
//#define CLIP_DEBUG_FUNCTIONS
|
//#define CLIP_DEBUG_FUNCTIONS
|
||||||
|
|
||||||
// RGB uint8 image
|
// RGB uint8 image
|
||||||
@ -165,7 +169,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
|||||||
static int get_key_idx(const gguf_context * ctx, const char * key) {
|
static int get_key_idx(const gguf_context * ctx, const char * key) {
|
||||||
int i = gguf_find_key(ctx, key);
|
int i = gguf_find_key(ctx, key);
|
||||||
if (i == -1) {
|
if (i == -1) {
|
||||||
LOG_TEE("key %s not found in file\n", key);
|
LOG_ERR("key %s not found in file\n", key);
|
||||||
throw std::runtime_error(format("Missing required key: %s", key));
|
throw std::runtime_error(format("Missing required key: %s", key));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -270,7 +274,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
|
|||||||
|
|
||||||
static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
|
static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
|
||||||
size_t tensor_size = ggml_nbytes(tensor);
|
size_t tensor_size = ggml_nbytes(tensor);
|
||||||
LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
|
LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
|
||||||
prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
|
prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
|
||||||
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
|
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
|
||||||
}
|
}
|
||||||
@ -288,7 +292,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
|
|||||||
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
|
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
|
||||||
std::ofstream file(filename, std::ios::binary);
|
std::ofstream file(filename, std::ios::binary);
|
||||||
if (!file.is_open()) {
|
if (!file.is_open()) {
|
||||||
LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
|
LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -307,7 +311,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
|
|||||||
static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
|
static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
|
||||||
std::ofstream file(filename, std::ios::binary);
|
std::ofstream file(filename, std::ios::binary);
|
||||||
if (!file.is_open()) {
|
if (!file.is_open()) {
|
||||||
LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
|
LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -568,7 +572,7 @@ struct clip_ctx {
|
|||||||
|
|
||||||
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
|
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
LOG_ERR("This gguf file seems to have no vision encoder\n");
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -582,7 +586,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||||||
if (load_image_size == nullptr) {
|
if (load_image_size == nullptr) {
|
||||||
load_image_size = clip_image_size_init();
|
load_image_size = clip_image_size_init();
|
||||||
}
|
}
|
||||||
LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
||||||
image_size_width = load_image_size->width;
|
image_size_width = load_image_size->width;
|
||||||
image_size_height = load_image_size->height;
|
image_size_height = load_image_size->height;
|
||||||
if (is_inf) {
|
if (is_inf) {
|
||||||
@ -1047,21 +1051,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
const int idx_name = gguf_find_key(ctx, KEY_NAME);
|
const int idx_name = gguf_find_key(ctx, KEY_NAME);
|
||||||
if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
|
if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
|
||||||
const std::string name = gguf_get_val_str(ctx, idx_name);
|
const std::string name = gguf_get_val_str(ctx, idx_name);
|
||||||
LOG_TEE("%s: model name: %s\n", __func__, name.c_str());
|
LOG_INF("%s: model name: %s\n", __func__, name.c_str());
|
||||||
}
|
}
|
||||||
LOG_TEE("%s: description: %s\n", __func__, description.c_str());
|
LOG_INF("%s: description: %s\n", __func__, description.c_str());
|
||||||
LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
|
LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
|
||||||
LOG_TEE("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
|
LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
|
||||||
LOG_TEE("%s: n_tensors: %d\n", __func__, n_tensors);
|
LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
|
||||||
LOG_TEE("%s: n_kv: %d\n", __func__, n_kv);
|
LOG_INF("%s: n_kv: %d\n", __func__, n_kv);
|
||||||
LOG_TEE("%s: ftype: %s\n", __func__, ftype_str.c_str());
|
LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str());
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
}
|
}
|
||||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||||
|
|
||||||
// kv
|
// kv
|
||||||
const int n_kv = gguf_get_n_kv(ctx);
|
const int n_kv = gguf_get_n_kv(ctx);
|
||||||
LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
|
LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
|
||||||
__func__, n_kv, n_tensors, fname);
|
__func__, n_kv, n_tensors, fname);
|
||||||
{
|
{
|
||||||
std::map<enum ggml_type, uint32_t> n_type;
|
std::map<enum ggml_type, uint32_t> n_type;
|
||||||
@ -1072,7 +1076,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
n_type[type]++;
|
n_type[type]++;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
|
LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
|
||||||
for (int i = 0; i < n_kv; i++) {
|
for (int i = 0; i < n_kv; i++) {
|
||||||
const char * name = gguf_get_key(ctx, i);
|
const char * name = gguf_get_key(ctx, i);
|
||||||
const enum gguf_type type = gguf_get_kv_type(ctx, i);
|
const enum gguf_type type = gguf_get_kv_type(ctx, i);
|
||||||
@ -1088,7 +1092,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
}
|
}
|
||||||
replace_all(value, "\n", "\\n");
|
replace_all(value, "\n", "\\n");
|
||||||
|
|
||||||
LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
|
LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
// print type counts
|
// print type counts
|
||||||
@ -1097,7 +1101,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
|
LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1112,7 +1116,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
size_t tensor_size = ggml_nbytes(cur);
|
size_t tensor_size = ggml_nbytes(cur);
|
||||||
model_size += tensor_size;
|
model_size += tensor_size;
|
||||||
if (verbosity >= 3) {
|
if (verbosity >= 3) {
|
||||||
LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
|
LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
|
||||||
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
|
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1139,27 +1143,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
|
|
||||||
#ifdef GGML_USE_CUDA
|
#ifdef GGML_USE_CUDA
|
||||||
new_clip->backend = ggml_backend_cuda_init(0);
|
new_clip->backend = ggml_backend_cuda_init(0);
|
||||||
LOG_TEE("%s: CLIP using CUDA backend\n", __func__);
|
LOG_INF("%s: CLIP using CUDA backend\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
new_clip->backend = ggml_backend_metal_init();
|
new_clip->backend = ggml_backend_metal_init();
|
||||||
LOG_TEE("%s: CLIP using Metal backend\n", __func__);
|
LOG_INF("%s: CLIP using Metal backend\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_CANN
|
#ifdef GGML_USE_CANN
|
||||||
new_clip->backend = ggml_backend_cann_init(0);
|
new_clip->backend = ggml_backend_cann_init(0);
|
||||||
LOG_TEE("%s: CLIP using CANN backend\n", __func__);
|
LOG_INF("%s: CLIP using CANN backend\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_VULKAN
|
#ifdef GGML_USE_VULKAN
|
||||||
new_clip->backend = ggml_backend_vk_init(0);
|
new_clip->backend = ggml_backend_vk_init(0);
|
||||||
LOG_TEE("%s: CLIP using Vulkan backend\n", __func__);
|
LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (!new_clip->backend) {
|
if (!new_clip->backend) {
|
||||||
new_clip->backend = ggml_backend_cpu_init();
|
new_clip->backend = ggml_backend_cpu_init();
|
||||||
LOG_TEE("%s: CLIP using CPU backend\n", __func__);
|
LOG_INF("%s: CLIP using CPU backend\n", __func__);
|
||||||
}
|
}
|
||||||
|
|
||||||
// model size and capabilities
|
// model size and capabilities
|
||||||
@ -1194,16 +1198,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
|
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
|
||||||
|
|
||||||
if (verbosity >= 1) {
|
if (verbosity >= 1) {
|
||||||
LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
||||||
LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
||||||
LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
||||||
LOG_TEE("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
|
LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
|
||||||
LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
|
LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
|
||||||
LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
|
LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
|
||||||
|
|
||||||
// load tensors
|
// load tensors
|
||||||
{
|
{
|
||||||
@ -1216,7 +1220,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
|
|
||||||
new_clip->ctx_data = ggml_init(params);
|
new_clip->ctx_data = ggml_init(params);
|
||||||
if (!new_clip->ctx_data) {
|
if (!new_clip->ctx_data) {
|
||||||
LOG_TEE("%s: ggml_init() failed\n", __func__);
|
LOG_ERR("%s: ggml_init() failed\n", __func__);
|
||||||
clip_free(new_clip);
|
clip_free(new_clip);
|
||||||
gguf_free(ctx);
|
gguf_free(ctx);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
@ -1224,7 +1228,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
|
|
||||||
auto fin = std::ifstream(fname, std::ios::binary);
|
auto fin = std::ifstream(fname, std::ios::binary);
|
||||||
if (!fin) {
|
if (!fin) {
|
||||||
LOG_TEE("cannot open model file for loading tensors\n");
|
LOG_ERR("cannot open model file for loading tensors\n");
|
||||||
clip_free(new_clip);
|
clip_free(new_clip);
|
||||||
gguf_free(ctx);
|
gguf_free(ctx);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
@ -1246,7 +1250,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
|
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
|
||||||
fin.seekg(offset, std::ios::beg);
|
fin.seekg(offset, std::ios::beg);
|
||||||
if (!fin) {
|
if (!fin) {
|
||||||
LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name);
|
LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name);
|
||||||
clip_free(new_clip);
|
clip_free(new_clip);
|
||||||
gguf_free(ctx);
|
gguf_free(ctx);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
@ -1317,23 +1321,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (verbosity >= 2) {
|
if (verbosity >= 2) {
|
||||||
LOG_TEE("\n%s: vision model hparams\n", __func__);
|
LOG_INF("\n%s: vision model hparams\n", __func__);
|
||||||
LOG_TEE("image_size %d\n", hparams.image_size);
|
LOG_INF("image_size %d\n", hparams.image_size);
|
||||||
LOG_TEE("patch_size %d\n", hparams.patch_size);
|
LOG_INF("patch_size %d\n", hparams.patch_size);
|
||||||
LOG_TEE("v_hidden_size %d\n", hparams.hidden_size);
|
LOG_INF("v_hidden_size %d\n", hparams.hidden_size);
|
||||||
LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate);
|
LOG_INF("v_n_intermediate %d\n", hparams.n_intermediate);
|
||||||
LOG_TEE("v_projection_dim %d\n", hparams.projection_dim);
|
LOG_INF("v_projection_dim %d\n", hparams.projection_dim);
|
||||||
LOG_TEE("v_n_head %d\n", hparams.n_head);
|
LOG_INF("v_n_head %d\n", hparams.n_head);
|
||||||
LOG_TEE("v_n_layer %d\n", hparams.n_layer);
|
LOG_INF("v_n_layer %d\n", hparams.n_layer);
|
||||||
LOG_TEE("v_eps %f\n", hparams.eps);
|
LOG_INF("v_eps %f\n", hparams.eps);
|
||||||
LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
|
LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
|
||||||
LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
|
LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
|
||||||
LOG_TEE("v_image_grid_pinpoints: ");
|
LOG_INF("v_image_grid_pinpoints: ");
|
||||||
for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
|
for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
|
||||||
LOG_TEE("%d ", hparams.image_grid_pinpoints[i]);
|
LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
|
||||||
}
|
}
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
|
LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1371,7 +1375,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
|
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
|
||||||
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
|
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
|
||||||
} catch(const std::exception& /*e*/) {
|
} catch(const std::exception& /*e*/) {
|
||||||
LOG_TEE("%s: failed to load vision model tensors\n", __func__);
|
LOG_ERR("%s: failed to load vision model tensors\n", __func__);
|
||||||
}
|
}
|
||||||
|
|
||||||
// LLaVA projection
|
// LLaVA projection
|
||||||
@ -1400,7 +1404,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
} catch (std::runtime_error & /*e*/) { }
|
} catch (std::runtime_error & /*e*/) { }
|
||||||
try {
|
try {
|
||||||
vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
|
vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
|
||||||
// LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
|
// LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__);
|
||||||
} catch (std::runtime_error & /*e*/) { }
|
} catch (std::runtime_error & /*e*/) { }
|
||||||
} else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
|
} else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
|
||||||
// MobileVLM projection
|
// MobileVLM projection
|
||||||
@ -1501,7 +1505,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
|
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
|
||||||
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
|
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
|
||||||
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
|
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
|
||||||
LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
return new_clip;
|
return new_clip;
|
||||||
@ -1552,7 +1556,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
|
|||||||
int nx, ny, nc;
|
int nx, ny, nc;
|
||||||
auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
|
auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
|
||||||
if (!data) {
|
if (!data) {
|
||||||
LOG_TEE("%s: failed to load image '%s'\n", __func__, fname);
|
LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
build_clip_img_from_data(data, nx, ny, img);
|
build_clip_img_from_data(data, nx, ny, img);
|
||||||
@ -1564,7 +1568,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
|
|||||||
int nx, ny, nc;
|
int nx, ny, nc;
|
||||||
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
|
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
|
||||||
if (!data) {
|
if (!data) {
|
||||||
LOG_TEE("%s: failed to decode image bytes\n", __func__);
|
LOG_ERR("%s: failed to decode image bytes\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
build_clip_img_from_data(data, nx, ny, img);
|
build_clip_img_from_data(data, nx, ny, img);
|
||||||
@ -1754,7 +1758,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
|
|||||||
int downscaled_height = static_cast<int>(original_height * scale);
|
int downscaled_height = static_cast<int>(original_height * scale);
|
||||||
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
|
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
|
||||||
int wasted_resolution = (width * height) - effective_resolution;
|
int wasted_resolution = (width * height) - effective_resolution;
|
||||||
// LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
|
// LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
|
||||||
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
|
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
|
||||||
max_effective_resolution = effective_resolution;
|
max_effective_resolution = effective_resolution;
|
||||||
min_wasted_resolution = wasted_resolution;
|
min_wasted_resolution = wasted_resolution;
|
||||||
@ -1872,7 +1876,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
|
|||||||
const int multiple = fmin(ceil(ratio), max_slice_nums);
|
const int multiple = fmin(ceil(ratio), max_slice_nums);
|
||||||
|
|
||||||
std::vector<std::vector<clip_image_u8 *>> images;
|
std::vector<std::vector<clip_image_u8 *>> images;
|
||||||
LOG_TEE("%s: multiple %d\n", __func__, multiple);
|
LOG_INF("%s: multiple %d\n", __func__, multiple);
|
||||||
images.push_back(std::vector<clip_image_u8 *>());
|
images.push_back(std::vector<clip_image_u8 *>());
|
||||||
|
|
||||||
if (multiple <= 1) {
|
if (multiple <= 1) {
|
||||||
@ -1887,17 +1891,17 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
|
|||||||
clip_image_u8 * source_image = clip_image_u8_init();
|
clip_image_u8 * source_image = clip_image_u8_init();
|
||||||
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
|
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
|
||||||
// source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
|
// source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
|
||||||
LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
|
LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
|
||||||
images[images.size()-1].push_back(source_image);
|
images[images.size()-1].push_back(source_image);
|
||||||
|
|
||||||
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
|
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
|
||||||
LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
|
LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
|
||||||
|
|
||||||
auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
|
auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
|
||||||
clip_image_u8 * refine_image = clip_image_u8_init();
|
clip_image_u8 * refine_image = clip_image_u8_init();
|
||||||
bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
|
bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
|
||||||
|
|
||||||
LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
|
LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
|
||||||
|
|
||||||
// split_to_patches
|
// split_to_patches
|
||||||
int width = refine_image->nx;
|
int width = refine_image->nx;
|
||||||
@ -1954,7 +1958,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
|||||||
int idx = 0;
|
int idx = 0;
|
||||||
for (size_t i = 0; i < imgs.size(); ++i) {
|
for (size_t i = 0; i < imgs.size(); ++i) {
|
||||||
for (size_t j = 0; j < imgs[i].size(); ++j) {
|
for (size_t j = 0; j < imgs[i].size(); ++j) {
|
||||||
LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
|
LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
|
||||||
clip_image_f32 * res = clip_image_f32_init();
|
clip_image_f32 * res = clip_image_f32_init();
|
||||||
normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
|
normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
|
||||||
res_imgs->data[idx++] = *res;
|
res_imgs->data[idx++] = *res;
|
||||||
@ -1966,7 +1970,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
|||||||
|
|
||||||
bool pad_to_square = true;
|
bool pad_to_square = true;
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
LOG_ERR("This gguf file seems to have no vision encoder\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
auto & params = ctx->vision_model.hparams;
|
auto & params = ctx->vision_model.hparams;
|
||||||
@ -2043,7 +2047,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < patches.size(); i++) {
|
for (size_t i = 0; i < patches.size(); i++) {
|
||||||
// LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
|
// LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
|
||||||
clip_image_u8_free(patches[i]);
|
clip_image_u8_free(patches[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2279,7 +2283,7 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
|
|||||||
|
|
||||||
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
|
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
LOG_ERR("This gguf file seems to have no vision encoder\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2291,7 +2295,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
|
|||||||
|
|
||||||
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
|
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
LOG_ERR("This gguf file seems to have no vision encoder\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2521,7 +2525,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|||||||
new_type = type;
|
new_type = type;
|
||||||
if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
|
if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
|
||||||
new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
|
new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
|
||||||
// LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
|
// LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
|
||||||
}
|
}
|
||||||
const size_t n_elms = ggml_nelements(cur);
|
const size_t n_elms = ggml_nelements(cur);
|
||||||
float * f32_data;
|
float * f32_data;
|
||||||
@ -2540,7 +2544,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|||||||
f32_data = (float *)conv_buf.data();
|
f32_data = (float *)conv_buf.data();
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
LOG_TEE("Please use an input file in f32 or f16\n");
|
LOG_ERR("Please use an input file in f32 or f16\n");
|
||||||
gguf_free(ctx_out);
|
gguf_free(ctx_out);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -2567,7 +2571,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|||||||
fout.put(0);
|
fout.put(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
|
LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
|
||||||
orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2583,8 +2587,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|||||||
gguf_free(ctx_out);
|
gguf_free(ctx_out);
|
||||||
|
|
||||||
{
|
{
|
||||||
LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
|
LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
|
||||||
LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
|
LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
@ -10,6 +10,7 @@
|
|||||||
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
|
#include <cstring>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
|
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
|
||||||
@ -20,7 +21,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
|
|||||||
n_eval = n_batch;
|
n_eval = n_batch;
|
||||||
}
|
}
|
||||||
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
|
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
|
||||||
LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
*n_past += n_eval;
|
*n_past += n_eval;
|
||||||
@ -75,7 +76,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
|
|||||||
size_t img_base64_str_start, img_base64_str_end;
|
size_t img_base64_str_start, img_base64_str_end;
|
||||||
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
|
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
|
||||||
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
|
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
|
||||||
LOG_TEE("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
|
LOG_ERR("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -89,7 +90,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
|
|||||||
|
|
||||||
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
|
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
|
||||||
if (!embed) {
|
if (!embed) {
|
||||||
LOG_TEE("%s: could not load image from base64 string.\n", __func__);
|
LOG_ERR("%s: could not load image from base64 string.\n", __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -114,9 +115,9 @@ struct llava_context {
|
|||||||
};
|
};
|
||||||
|
|
||||||
static void print_usage(int, char ** argv) {
|
static void print_usage(int, char ** argv) {
|
||||||
LOG_TEE("\n example usage:\n");
|
LOG("\n example usage:\n");
|
||||||
LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
LOG("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||||
LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
|
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
|
||||||
@ -126,11 +127,11 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
|
|||||||
auto prompt = params->prompt;
|
auto prompt = params->prompt;
|
||||||
if (prompt_contains_image(prompt)) {
|
if (prompt_contains_image(prompt)) {
|
||||||
if (!params->image.empty()) {
|
if (!params->image.empty()) {
|
||||||
LOG_TEE("using base64 encoded image instead of command line image path\n");
|
LOG_INF("using base64 encoded image instead of command line image path\n");
|
||||||
}
|
}
|
||||||
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
|
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
|
||||||
if (!embed) {
|
if (!embed) {
|
||||||
LOG_TEE("%s: can't load image from prompt\n", __func__);
|
LOG_ERR("%s: can't load image from prompt\n", __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
params->prompt = remove_image_from_prompt(prompt);
|
params->prompt = remove_image_from_prompt(prompt);
|
||||||
@ -156,18 +157,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||||||
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
|
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
|
||||||
system_prompt = prompt.substr(0, image_pos);
|
system_prompt = prompt.substr(0, image_pos);
|
||||||
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
|
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
|
||||||
LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
|
LOG_INF("system_prompt: %s\n", system_prompt.c_str());
|
||||||
if (params->verbose_prompt) {
|
if (params->verbose_prompt) {
|
||||||
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
|
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
|
||||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
|
LOG_INF("user_prompt: %s\n", user_prompt.c_str());
|
||||||
if (params->verbose_prompt) {
|
if (params->verbose_prompt) {
|
||||||
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
|
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
|
||||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -177,7 +178,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||||||
if (params->verbose_prompt) {
|
if (params->verbose_prompt) {
|
||||||
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
|
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
|
||||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -188,11 +189,11 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||||||
|
|
||||||
// generate the response
|
// generate the response
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
|
|
||||||
struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
|
struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
|
||||||
if (!smpl) {
|
if (!smpl) {
|
||||||
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
|
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -202,7 +203,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||||||
response += tmp;
|
response += tmp;
|
||||||
if (strcmp(tmp, "</s>") == 0) break;
|
if (strcmp(tmp, "</s>") == 0) break;
|
||||||
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||||
printf("%s", tmp);
|
LOG("%s", tmp);
|
||||||
if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
|
if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
|
||||||
if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
|
if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
|
||||||
if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
|
if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
|
||||||
@ -211,7 +212,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||||||
}
|
}
|
||||||
|
|
||||||
gpt_sampler_free(smpl);
|
gpt_sampler_free(smpl);
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct llama_model * llava_init(gpt_params * params) {
|
static struct llama_model * llava_init(gpt_params * params) {
|
||||||
@ -222,7 +223,7 @@ static struct llama_model * llava_init(gpt_params * params) {
|
|||||||
|
|
||||||
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
|
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
LOG_TEE("%s: error: unable to load model\n" , __func__);
|
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
return model;
|
return model;
|
||||||
@ -245,11 +246,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
|
|||||||
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
|
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
|
||||||
|
|
||||||
if (ctx_llama == NULL) {
|
if (ctx_llama == NULL) {
|
||||||
LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
|
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
|
auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
|
||||||
|
|
||||||
ctx_llava->ctx_llama = ctx_llama;
|
ctx_llava->ctx_llama = ctx_llama;
|
||||||
ctx_llava->ctx_clip = ctx_clip;
|
ctx_llava->ctx_clip = ctx_clip;
|
||||||
@ -268,12 +269,6 @@ static void llava_free(struct llava_context * ctx_llava) {
|
|||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
}
|
}
|
||||||
|
|
||||||
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
|
|
||||||
(void) level;
|
|
||||||
(void) user_data;
|
|
||||||
LOG_TEE("%s", text);
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
ggml_time_init();
|
ggml_time_init();
|
||||||
|
|
||||||
@ -283,27 +278,23 @@ int main(int argc, char ** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
gpt_init();
|
||||||
log_set_target(log_filename_generator("llava", "log"));
|
|
||||||
LOG_TEE("Log start\n");
|
|
||||||
log_dump_cmdline(argc, argv);
|
|
||||||
llama_log_set(llama_log_callback_logTee, nullptr);
|
|
||||||
#endif // LOG_DISABLE_LOGS
|
|
||||||
|
|
||||||
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||||
print_usage(argc, argv);
|
print_usage(argc, argv);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
auto model = llava_init(¶ms);
|
|
||||||
|
auto * model = llava_init(¶ms);
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
|
fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (prompt_contains_image(params.prompt)) {
|
if (prompt_contains_image(params.prompt)) {
|
||||||
auto ctx_llava = llava_init_context(¶ms, model);
|
auto * ctx_llava = llava_init_context(¶ms, model);
|
||||||
|
|
||||||
auto image_embed = load_image(ctx_llava, ¶ms, "");
|
auto * image_embed = load_image(ctx_llava, ¶ms, "");
|
||||||
|
|
||||||
// process the prompt
|
// process the prompt
|
||||||
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
process_prompt(ctx_llava, image_embed, ¶ms, params.prompt);
|
||||||
@ -314,11 +305,11 @@ int main(int argc, char ** argv) {
|
|||||||
llava_free(ctx_llava);
|
llava_free(ctx_llava);
|
||||||
} else {
|
} else {
|
||||||
for (auto & image : params.image) {
|
for (auto & image : params.image) {
|
||||||
auto ctx_llava = llava_init_context(¶ms, model);
|
auto * ctx_llava = llava_init_context(¶ms, model);
|
||||||
|
|
||||||
auto image_embed = load_image(ctx_llava, ¶ms, image);
|
auto * image_embed = load_image(ctx_llava, ¶ms, image);
|
||||||
if (!image_embed) {
|
if (!image_embed) {
|
||||||
std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
|
LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,13 +1,23 @@
|
|||||||
#include "clip.h"
|
#include "clip.h"
|
||||||
#include "common.h"
|
|
||||||
#include "llama.h"
|
|
||||||
#include "llava.h"
|
#include "llava.h"
|
||||||
#include "base64.hpp"
|
|
||||||
|
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cerrno>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
|
#include <cstring>
|
||||||
|
#include <limits>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <numeric>
|
|
||||||
|
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
|
||||||
|
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
|
||||||
|
|
||||||
|
#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
|
||||||
|
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
||||||
|
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
||||||
|
#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
|
||||||
|
|
||||||
// RGB uint8 image
|
// RGB uint8 image
|
||||||
struct clip_image_u8 {
|
struct clip_image_u8 {
|
||||||
@ -54,7 +64,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int>& ori
|
|||||||
int downscaled_height = static_cast<int>(original_height * scale);
|
int downscaled_height = static_cast<int>(original_height * scale);
|
||||||
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
|
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
|
||||||
int wasted_resolution = (width * height) - effective_resolution;
|
int wasted_resolution = (width * height) - effective_resolution;
|
||||||
// LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
|
// LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
|
||||||
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
|
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
|
||||||
max_effective_resolution = effective_resolution;
|
max_effective_resolution = effective_resolution;
|
||||||
min_wasted_resolution = wasted_resolution;
|
min_wasted_resolution = wasted_resolution;
|
||||||
@ -236,7 +246,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
|||||||
img_res_v.size = 0;
|
img_res_v.size = 0;
|
||||||
img_res_v.data = nullptr;
|
img_res_v.data = nullptr;
|
||||||
if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
|
if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
|
||||||
LOG_TEE("%s: unable to preprocess image\n", __func__);
|
LOG_ERR("%s: unable to preprocess image\n", __func__);
|
||||||
delete[] img_res_v.data;
|
delete[] img_res_v.data;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -265,14 +275,14 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
|||||||
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
|
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
|
||||||
}
|
}
|
||||||
if (!encoded) {
|
if (!encoded) {
|
||||||
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
|
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
const int64_t t_img_enc_steop_batch_us = ggml_time_us();
|
const int64_t t_img_enc_steop_batch_us = ggml_time_us();
|
||||||
LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
|
LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
|
||||||
}
|
}
|
||||||
const int64_t t_img_enc_batch_us = ggml_time_us();
|
const int64_t t_img_enc_batch_us = ggml_time_us();
|
||||||
LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
||||||
|
|
||||||
int n_img_pos_out = 0;
|
int n_img_pos_out = 0;
|
||||||
for (size_t i = 0; i < image_embd_v.size(); i++) {
|
for (size_t i = 0; i < image_embd_v.size(); i++) {
|
||||||
@ -287,7 +297,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
|||||||
load_image_size->width = img->nx;
|
load_image_size->width = img->nx;
|
||||||
load_image_size->height = img->ny;
|
load_image_size->height = img->ny;
|
||||||
clip_add_load_image_size(ctx_clip, load_image_size);
|
clip_add_load_image_size(ctx_clip, load_image_size);
|
||||||
LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
||||||
}
|
}
|
||||||
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
|
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
|
||||||
// flat / default llava-1.5 type embedding
|
// flat / default llava-1.5 type embedding
|
||||||
@ -295,7 +305,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
|||||||
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
|
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
|
||||||
delete[] img_res_v.data;
|
delete[] img_res_v.data;
|
||||||
if (!encoded) {
|
if (!encoded) {
|
||||||
LOG_TEE("Unable to encode image\n");
|
LOG_ERR("Unable to encode image\n");
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -309,12 +319,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
|||||||
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
|
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
|
||||||
const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
|
const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
|
||||||
if (!encoded) {
|
if (!encoded) {
|
||||||
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
|
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
const int64_t t_img_enc_batch_us = ggml_time_us();
|
const int64_t t_img_enc_batch_us = ggml_time_us();
|
||||||
LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
||||||
|
|
||||||
const int32_t * image_grid = clip_image_grid(ctx_clip);
|
const int32_t * image_grid = clip_image_grid(ctx_clip);
|
||||||
|
|
||||||
@ -347,12 +357,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
|||||||
// clip_image_save_to_bmp(*tmp, "image_feature.bmp");
|
// clip_image_save_to_bmp(*tmp, "image_feature.bmp");
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
|
LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
|
||||||
|
|
||||||
const int64_t t_img_enc_end_us = ggml_time_us();
|
const int64_t t_img_enc_end_us = ggml_time_us();
|
||||||
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
|
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
|
||||||
|
|
||||||
LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
|
LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -362,7 +372,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
|
|||||||
int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
|
int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
|
||||||
auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
|
auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
|
||||||
if (n_image_embd != n_llama_embd) {
|
if (n_image_embd != n_llama_embd) {
|
||||||
LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
|
LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
@ -375,13 +385,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
|
|||||||
}
|
}
|
||||||
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
|
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
|
||||||
if (!image_embd) {
|
if (!image_embd) {
|
||||||
LOG_TEE("Unable to allocate memory for image embeddings\n");
|
LOG_ERR("Unable to allocate memory for image embeddings\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
int n_img_pos;
|
int n_img_pos;
|
||||||
if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
|
if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
|
||||||
LOG_TEE("%s: cannot encode image, aborting\n", __func__);
|
LOG_ERR("%s: cannot encode image, aborting\n", __func__);
|
||||||
free(image_embd);
|
free(image_embd);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -401,7 +411,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
|
|||||||
}
|
}
|
||||||
llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
|
llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
|
||||||
if (llama_decode(ctx_llama, batch)) {
|
if (llama_decode(ctx_llama, batch)) {
|
||||||
LOG_TEE("%s : failed to eval\n", __func__);
|
LOG_ERR("%s : failed to eval\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
*n_past += n_eval;
|
*n_past += n_eval;
|
||||||
@ -413,7 +423,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
|
|||||||
clip_image_u8 * img = clip_image_u8_init();
|
clip_image_u8 * img = clip_image_u8_init();
|
||||||
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
|
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
|
||||||
clip_image_u8_free(img);
|
clip_image_u8_free(img);
|
||||||
LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
|
LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -422,7 +432,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
|
|||||||
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
|
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
|
||||||
if (!image_embed_result) {
|
if (!image_embed_result) {
|
||||||
clip_image_u8_free(img);
|
clip_image_u8_free(img);
|
||||||
LOG_TEE("%s: coulnd't embed the image\n", __func__);
|
LOG_ERR("%s: coulnd't embed the image\n", __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -436,7 +446,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
|
|||||||
static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
|
static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
|
||||||
auto file = fopen(path, "rb");
|
auto file = fopen(path, "rb");
|
||||||
if (file == NULL) {
|
if (file == NULL) {
|
||||||
LOG_TEE("%s: can't read file %s\n", __func__, path);
|
LOG_ERR("%s: can't read file %s\n", __func__, path);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -446,7 +456,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
|
|||||||
|
|
||||||
auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
|
auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
|
||||||
if (buffer == NULL) {
|
if (buffer == NULL) {
|
||||||
LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
|
LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
|
||||||
perror("Memory allocation error");
|
perror("Memory allocation error");
|
||||||
fclose(file);
|
fclose(file);
|
||||||
return false;
|
return false;
|
||||||
@ -471,7 +481,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx
|
|||||||
long image_bytes_length;
|
long image_bytes_length;
|
||||||
auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
|
auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
|
||||||
if (!loaded) {
|
if (!loaded) {
|
||||||
LOG_TEE("%s: failed to load %s\n", __func__, image_path);
|
LOG_ERR("%s: failed to load %s\n", __func__, image_path);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7,9 +7,12 @@
|
|||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
|
#include <cstring>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <iostream> // TODO: remove me
|
||||||
|
|
||||||
struct llava_context {
|
struct llava_context {
|
||||||
struct clip_ctx * ctx_clip = NULL;
|
struct clip_ctx * ctx_clip = NULL;
|
||||||
@ -18,14 +21,8 @@ struct llava_context {
|
|||||||
};
|
};
|
||||||
|
|
||||||
static void show_additional_info(int /*argc*/, char ** argv) {
|
static void show_additional_info(int /*argc*/, char ** argv) {
|
||||||
LOG_TEE("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
LOG("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||||
LOG_TEE("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
|
LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||||
}
|
|
||||||
|
|
||||||
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
|
|
||||||
(void) level;
|
|
||||||
(void) user_data;
|
|
||||||
LOG_TEE("%s", text);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct llama_model * llava_init(gpt_params * params) {
|
static struct llama_model * llava_init(gpt_params * params) {
|
||||||
@ -36,7 +33,7 @@ static struct llama_model * llava_init(gpt_params * params) {
|
|||||||
|
|
||||||
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
|
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
LOG_TEE("%s: error: unable to load model\n" , __func__);
|
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
return model;
|
return model;
|
||||||
@ -51,7 +48,7 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
|
|||||||
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
|
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
|
||||||
if (params->n_ctx < 2048) {
|
if (params->n_ctx < 2048) {
|
||||||
// warn user here, "Image processing requires at least 2048 context, setting context to 2048"
|
// warn user here, "Image processing requires at least 2048 context, setting context to 2048"
|
||||||
LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
|
LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
|
||||||
ctx_params.n_ctx = 2048;
|
ctx_params.n_ctx = 2048;
|
||||||
} else {
|
} else {
|
||||||
ctx_params.n_ctx = params->n_ctx;
|
ctx_params.n_ctx = params->n_ctx;
|
||||||
@ -60,11 +57,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
|
|||||||
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
|
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
|
||||||
|
|
||||||
if (ctx_llama == NULL) {
|
if (ctx_llama == NULL) {
|
||||||
LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
|
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
|
auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
|
||||||
|
|
||||||
ctx_llava->ctx_llama = ctx_llama;
|
ctx_llava->ctx_llama = ctx_llama;
|
||||||
ctx_llava->model = model;
|
ctx_llava->model = model;
|
||||||
@ -89,7 +86,7 @@ static struct clip_ctx * clip_init_context(gpt_params * params) {
|
|||||||
if (prompt.empty()) {
|
if (prompt.empty()) {
|
||||||
prompt = "describe the image in detail.";
|
prompt = "describe the image in detail.";
|
||||||
}
|
}
|
||||||
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
|
auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
|
||||||
return ctx_clip;
|
return ctx_clip;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -101,7 +98,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
|
|||||||
n_eval = n_batch;
|
n_eval = n_batch;
|
||||||
}
|
}
|
||||||
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
|
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
|
||||||
LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
*n_past += n_eval;
|
*n_past += n_eval;
|
||||||
@ -125,7 +122,7 @@ static void process_eval_image_embed(struct llava_context * ctx_llava, const str
|
|||||||
float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
|
float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
|
||||||
std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
|
std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
|
||||||
|
|
||||||
auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
|
auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
|
||||||
slice_embed->embed = image_embed;
|
slice_embed->embed = image_embed;
|
||||||
slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
|
slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
|
||||||
llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
|
llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
|
||||||
@ -143,7 +140,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
|
|||||||
else if (has_minicpmv_projector == 3) {
|
else if (has_minicpmv_projector == 3) {
|
||||||
system_prompt = "<|im_start|>user\n";
|
system_prompt = "<|im_start|>user\n";
|
||||||
}
|
}
|
||||||
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
|
LOG_INF("%s: image token past: %d\n", __func__, n_past);
|
||||||
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
|
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
|
||||||
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
||||||
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
|
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
|
||||||
@ -162,7 +159,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
|
|||||||
}
|
}
|
||||||
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
|
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
|
||||||
}
|
}
|
||||||
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
|
LOG_INF("%s: image token past: %d\n", __func__, n_past);
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char * sample(struct gpt_sampler * smpl,
|
static const char * sample(struct gpt_sampler * smpl,
|
||||||
@ -181,42 +178,42 @@ static const char * sample(struct gpt_sampler * smpl,
|
|||||||
}
|
}
|
||||||
|
|
||||||
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
|
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
|
||||||
auto ctx_clip = clip_init_context(params);
|
auto * ctx_clip = clip_init_context(params);
|
||||||
auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
|
auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
|
||||||
if (!embeds) {
|
if (!embeds) {
|
||||||
std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
|
LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str());
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
// process the prompt
|
// process the prompt
|
||||||
if (params->prompt.empty() && params->interactive == false) {
|
if (params->prompt.empty() && params->interactive == false) {
|
||||||
LOG_TEE("prompt should be given or interactive mode should be on");
|
LOG_ERR("prompt should be given or interactive mode should be on");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto model = llava_init(params);
|
auto * model = llava_init(params);
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
|
fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
const int64_t t_llava_init_start_us = ggml_time_us();
|
const int64_t t_llava_init_start_us = ggml_time_us();
|
||||||
auto ctx_llava = llava_init_context(params, model);
|
auto * ctx_llava = llava_init_context(params, model);
|
||||||
ctx_llava->ctx_clip = ctx_clip;
|
ctx_llava->ctx_clip = ctx_clip;
|
||||||
const int64_t t_llava_init_end_us = ggml_time_us();
|
const int64_t t_llava_init_end_us = ggml_time_us();
|
||||||
float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
|
float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
|
||||||
LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
|
LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
|
||||||
|
|
||||||
const int64_t t_process_image_start_us = ggml_time_us();
|
const int64_t t_process_image_start_us = ggml_time_us();
|
||||||
process_image(ctx_llava, embeds, params, n_past);
|
process_image(ctx_llava, embeds, params, n_past);
|
||||||
const int64_t t_process_image_end_us = ggml_time_us();
|
const int64_t t_process_image_end_us = ggml_time_us();
|
||||||
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
|
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
|
||||||
LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
|
LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
|
||||||
|
|
||||||
llava_image_embed_free(embeds);
|
llava_image_embed_free(embeds);
|
||||||
return ctx_llava;
|
return ctx_llava;
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
|
static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, const std::string & prompt, int & n_past, bool is_first = false){
|
||||||
std::string user_prompt = prompt;
|
std::string user_prompt = prompt;
|
||||||
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
|
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
|
||||||
if (!is_first) {
|
if (!is_first) {
|
||||||
@ -238,7 +235,7 @@ static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_par
|
|||||||
|
|
||||||
// generate the response
|
// generate the response
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
|
|
||||||
struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
|
struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
|
||||||
return smpl;
|
return smpl;
|
||||||
@ -259,12 +256,7 @@ int main(int argc, char ** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
gpt_init();
|
||||||
log_set_target(log_filename_generator("llava", "log"));
|
|
||||||
LOG_TEE("Log start\n");
|
|
||||||
log_dump_cmdline(argc, argv);
|
|
||||||
llama_log_set(llama_log_callback_logTee, nullptr);
|
|
||||||
#endif // LOG_DISABLE_LOGS
|
|
||||||
|
|
||||||
if (params.mmproj.empty() || (params.image.empty())) {
|
if (params.mmproj.empty() || (params.image.empty())) {
|
||||||
show_additional_info(argc, argv);
|
show_additional_info(argc, argv);
|
||||||
@ -273,21 +265,23 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
for (auto & image : params.image) {
|
for (auto & image : params.image) {
|
||||||
int n_past = 0;
|
int n_past = 0;
|
||||||
auto ctx_llava = minicpmv_init(&params, image, n_past);
|
auto * ctx_llava = minicpmv_init(&params, image, n_past);
|
||||||
|
|
||||||
if (!params.prompt.empty()) {
|
if (!params.prompt.empty()) {
|
||||||
LOG_TEE("<user>%s\n", params.prompt.c_str());
|
LOG("<user>%s\n", params.prompt.c_str());
|
||||||
LOG_TEE("<assistant>");
|
LOG("<assistant>");
|
||||||
auto smpl = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true);
|
auto * smpl = llama_init(ctx_llava, &params, params.prompt, n_past, true);
|
||||||
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
|
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
|
||||||
std::string response = "";
|
std::string response;
|
||||||
bool have_tmp = false;
|
bool have_tmp = false;
|
||||||
for (int i = 0; i < max_tgt_len; i++) {
|
for (int i = 0; i < max_tgt_len; i++) {
|
||||||
auto tmp = llama_loop(ctx_llava, smpl, n_past);
|
const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
|
||||||
response += tmp;
|
response += tmp;
|
||||||
if (strcmp(tmp, "</s>") == 0){
|
if (strcmp(tmp, "</s>") == 0){
|
||||||
if(!have_tmp)continue;
|
if (!have_tmp) {
|
||||||
else break;
|
continue;
|
||||||
|
}
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||||
have_tmp = true;
|
have_tmp = true;
|
||||||
@ -299,15 +293,15 @@ int main(int argc, char ** argv) {
|
|||||||
gpt_sampler_free(smpl);
|
gpt_sampler_free(smpl);
|
||||||
}else {
|
}else {
|
||||||
while (true) {
|
while (true) {
|
||||||
LOG_TEE("<user>");
|
LOG("<user>");
|
||||||
std::string prompt;
|
std::string prompt;
|
||||||
std::getline(std::cin, prompt);
|
std::getline(std::cin, prompt);
|
||||||
LOG_TEE("<assistant>");
|
LOG("<assistant>");
|
||||||
auto smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
|
auto * smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
|
||||||
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
|
const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
|
||||||
std::string response = "";
|
std::string response;
|
||||||
for (int i = 0; i < max_tgt_len; i++) {
|
for (int i = 0; i < max_tgt_len; i++) {
|
||||||
auto tmp = llama_loop(ctx_llava, smpl, n_past);
|
const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
|
||||||
response += tmp;
|
response += tmp;
|
||||||
if (strcmp(tmp, "</s>") == 0) break;
|
if (strcmp(tmp, "</s>") == 0) break;
|
||||||
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#include "arg.h"
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "sampling.h"
|
#include "sampling.h"
|
||||||
|
#include "log.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
@ -42,18 +43,14 @@ int main(int argc, char ** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
gpt_init();
|
||||||
|
|
||||||
const int W = 15; // lookahead window
|
const int W = 15; // lookahead window
|
||||||
const int N = 5; // n-gram size
|
const int N = 5; // n-gram size
|
||||||
const int G = 15; // max verification n-grams
|
const int G = 15; // max verification n-grams
|
||||||
|
|
||||||
const bool dump_kv_cache = params.dump_kv_cache;
|
const bool dump_kv_cache = params.dump_kv_cache;
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
|
||||||
log_set_target(log_filename_generator("lookahead", "log"));
|
|
||||||
LOG_TEE("Log start\n");
|
|
||||||
log_dump_cmdline(argc, argv);
|
|
||||||
#endif // LOG_DISABLE_LOGS
|
|
||||||
|
|
||||||
// init llama.cpp
|
// init llama.cpp
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
@ -75,14 +72,14 @@ int main(int argc, char ** argv) {
|
|||||||
const int max_tokens_list_size = max_context_size - 4;
|
const int max_tokens_list_size = max_context_size - 4;
|
||||||
|
|
||||||
if ((int) inp.size() > max_tokens_list_size) {
|
if ((int) inp.size() > max_tokens_list_size) {
|
||||||
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
|
LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, "\n\n");
|
LOG("\n\n");
|
||||||
|
|
||||||
for (auto id : inp) {
|
for (auto id : inp) {
|
||||||
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
|
LOG("%s", llama_token_to_piece(ctx, id).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
fflush(stderr);
|
fflush(stderr);
|
||||||
@ -166,7 +163,7 @@ int main(int argc, char ** argv) {
|
|||||||
{
|
{
|
||||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||||
|
|
||||||
printf("%s", token_str.c_str());
|
LOG("%s", token_str.c_str());
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -256,7 +253,7 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (llama_decode(ctx, batch) != 0) {
|
if (llama_decode(ctx, batch) != 0) {
|
||||||
fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__);
|
LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -293,10 +290,10 @@ int main(int argc, char ** argv) {
|
|||||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||||
|
|
||||||
if (v == 0) {
|
if (v == 0) {
|
||||||
printf("%s", token_str.c_str());
|
LOG("%s", token_str.c_str());
|
||||||
} else {
|
} else {
|
||||||
// print light cyan
|
// print light cyan
|
||||||
printf("\033[0;96m%s\033[0m", token_str.c_str());
|
LOG("\033[0;96m%s\033[0m", token_str.c_str());
|
||||||
}
|
}
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
|
|
||||||
@ -330,21 +327,21 @@ int main(int argc, char ** argv) {
|
|||||||
// print known n-grams starting with token id (debug)
|
// print known n-grams starting with token id (debug)
|
||||||
if (0 && v == 0) {
|
if (0 && v == 0) {
|
||||||
if (ngrams_observed.cnt[id] > 0) {
|
if (ngrams_observed.cnt[id] > 0) {
|
||||||
printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
|
LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
|
for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
|
||||||
printf(" - ngram %2d: ", i);
|
LOG(" - ngram %2d: ", i);
|
||||||
|
|
||||||
const int idx = id*(N - 1)*G + i*(N - 1);
|
const int idx = id*(N - 1)*G + i*(N - 1);
|
||||||
|
|
||||||
for (int j = 0; j < N - 1; j++) {
|
for (int j = 0; j < N - 1; j++) {
|
||||||
const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
|
const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
|
||||||
|
|
||||||
printf("%s", token_str.c_str());
|
LOG("%s", token_str.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -455,20 +452,20 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
auto t_dec_end = ggml_time_us();
|
auto t_dec_end = ggml_time_us();
|
||||||
|
|
||||||
LOG_TEE("\n\n");
|
LOG("\n\n");
|
||||||
|
|
||||||
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
|
LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
|
||||||
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
|
LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
LOG_TEE("W = %2d\n", W);
|
LOG_INF("W = %2d\n", W);
|
||||||
LOG_TEE("N = %2d\n", N);
|
LOG_INF("N = %2d\n", N);
|
||||||
LOG_TEE("G = %2d\n", G);
|
LOG_INF("G = %2d\n", G);
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
LOG_TEE("n_predict = %d\n", n_predict);
|
LOG_INF("n_predict = %d\n", n_predict);
|
||||||
LOG_TEE("n_accept = %d\n", n_accept);
|
LOG_INF("n_accept = %d\n", n_accept);
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
gpt_perf_print(ctx, smpl);
|
gpt_perf_print(ctx, smpl);
|
||||||
|
|
||||||
gpt_sampler_free(smpl);
|
gpt_sampler_free(smpl);
|
||||||
@ -482,7 +479,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
fprintf(stderr, "\n\n");
|
LOG("\n\n");
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -5,13 +5,12 @@
|
|||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
|
||||||
#include <cmath>
|
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
#include <cinttypes>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <unordered_map>
|
|
||||||
|
|
||||||
int main(int argc, char ** argv){
|
int main(int argc, char ** argv){
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
@ -20,6 +19,8 @@ int main(int argc, char ** argv){
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
gpt_init();
|
||||||
|
|
||||||
const int n_draft = params.n_draft;
|
const int n_draft = params.n_draft;
|
||||||
|
|
||||||
// init llama.cpp
|
// init llama.cpp
|
||||||
@ -49,7 +50,7 @@ int main(int argc, char ** argv){
|
|||||||
try {
|
try {
|
||||||
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
|
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
|
||||||
} catch (std::ifstream::failure const &) {
|
} catch (std::ifstream::failure const &) {
|
||||||
fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
|
LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -128,7 +129,7 @@ int main(int argc, char ** argv){
|
|||||||
const int64_t eta_min = eta_ms / (60*1000);
|
const int64_t eta_min = eta_ms / (60*1000);
|
||||||
const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000;
|
const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000;
|
||||||
|
|
||||||
LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
|
LOG_INF("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
|
||||||
}
|
}
|
||||||
|
|
||||||
// After each chunk, update the dynamic ngram cache with the context ngram cache:
|
// After each chunk, update the dynamic ngram cache with the context ngram cache:
|
||||||
@ -136,24 +137,24 @@ int main(int argc, char ** argv){
|
|||||||
ngram_cache_context.clear();
|
ngram_cache_context.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
LOG_TEE("n_draft = %d\n", n_draft);
|
LOG_INF("n_draft = %d\n", n_draft);
|
||||||
LOG_TEE("n_predict = %d\n", n_input - n_input % n_ctx);
|
LOG_INF("n_predict = %d\n", n_input - n_input % n_ctx);
|
||||||
LOG_TEE("n_drafted = %d\n", n_drafted);
|
LOG_INF("n_drafted = %d\n", n_drafted);
|
||||||
LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
|
LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
|
||||||
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
|
LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
|
||||||
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
|
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
|
||||||
LOG_TEE("n_accept = %d\n", n_accept);
|
LOG_INF("n_accept = %d\n", n_accept);
|
||||||
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||||
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
fprintf(stderr, "\n\n");
|
LOG("\n\n");
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "ngram-cache.h"
|
#include "ngram-cache.h"
|
||||||
#include "sampling.h"
|
#include "sampling.h"
|
||||||
|
#include "log.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
@ -18,17 +19,13 @@ int main(int argc, char ** argv){
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
gpt_init();
|
||||||
|
|
||||||
// max. number of additional tokens to draft if match is found
|
// max. number of additional tokens to draft if match is found
|
||||||
const int n_draft = params.n_draft;
|
const int n_draft = params.n_draft;
|
||||||
|
|
||||||
const bool dump_kv_cache = params.dump_kv_cache;
|
const bool dump_kv_cache = params.dump_kv_cache;
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
|
||||||
log_set_target(log_filename_generator("lookup", "log"));
|
|
||||||
LOG_TEE("Log start\n");
|
|
||||||
log_dump_cmdline(argc, argv);
|
|
||||||
#endif // LOG_DISABLE_LOGS
|
|
||||||
|
|
||||||
// init llama.cpp
|
// init llama.cpp
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
@ -58,7 +55,7 @@ int main(int argc, char ** argv){
|
|||||||
try {
|
try {
|
||||||
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
|
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
|
||||||
} catch (std::ifstream::failure const &) {
|
} catch (std::ifstream::failure const &) {
|
||||||
fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
|
LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -76,14 +73,14 @@ int main(int argc, char ** argv){
|
|||||||
const int max_tokens_list_size = max_context_size - 4;
|
const int max_tokens_list_size = max_context_size - 4;
|
||||||
|
|
||||||
if ((int) inp.size() > max_tokens_list_size) {
|
if ((int) inp.size() > max_tokens_list_size) {
|
||||||
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
|
LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, "\n\n");
|
LOG("\n\n");
|
||||||
|
|
||||||
for (auto id : inp) {
|
for (auto id : inp) {
|
||||||
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
|
LOG("%s", llama_token_to_piece(ctx, id).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
fflush(stderr);
|
fflush(stderr);
|
||||||
@ -124,7 +121,7 @@ int main(int argc, char ** argv){
|
|||||||
}
|
}
|
||||||
|
|
||||||
// print current draft sequence
|
// print current draft sequence
|
||||||
LOG("drafted %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, draft).c_str());
|
LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
|
||||||
|
|
||||||
int i_dft = 0;
|
int i_dft = 0;
|
||||||
while (true) {
|
while (true) {
|
||||||
@ -136,7 +133,7 @@ int main(int argc, char ** argv){
|
|||||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||||
|
|
||||||
if (!params.use_color) {
|
if (!params.use_color) {
|
||||||
printf("%s", token_str.c_str());
|
LOG("%s", token_str.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (llama_token_is_eog(model, id)) {
|
if (llama_token_is_eog(model, id)) {
|
||||||
@ -147,7 +144,7 @@ int main(int argc, char ** argv){
|
|||||||
|
|
||||||
// check if the target token matches the draft
|
// check if the target token matches the draft
|
||||||
if (i_dft < (int) draft.size() && id == draft[i_dft]) {
|
if (i_dft < (int) draft.size() && id == draft[i_dft]) {
|
||||||
LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
|
LOG_DBG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
|
||||||
++n_accept;
|
++n_accept;
|
||||||
++n_past;
|
++n_past;
|
||||||
++i_dft;
|
++i_dft;
|
||||||
@ -161,19 +158,19 @@ int main(int argc, char ** argv){
|
|||||||
|
|
||||||
if (params.use_color) {
|
if (params.use_color) {
|
||||||
// color accepted draft token
|
// color accepted draft token
|
||||||
printf("\033[34m%s\033[0m", token_str.c_str());
|
LOG("\033[34m%s\033[0m", token_str.c_str());
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.use_color) {
|
if (params.use_color) {
|
||||||
printf("%s", token_str.c_str());
|
LOG("%s", token_str.c_str());
|
||||||
}
|
}
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
|
|
||||||
|
|
||||||
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
|
LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
|
||||||
|
|
||||||
draft.clear();
|
draft.clear();
|
||||||
draft.push_back(id);
|
draft.push_back(id);
|
||||||
@ -224,22 +221,22 @@ int main(int argc, char ** argv){
|
|||||||
llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
|
llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
|
||||||
llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
|
llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
|
||||||
|
|
||||||
LOG_TEE("\n\n");
|
LOG("\n\n");
|
||||||
|
|
||||||
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
|
LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
|
||||||
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
|
LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
LOG_TEE("n_draft = %d\n", n_draft);
|
LOG_INF("n_draft = %d\n", n_draft);
|
||||||
LOG_TEE("n_predict = %d\n", n_predict);
|
LOG_INF("n_predict = %d\n", n_predict);
|
||||||
LOG_TEE("n_drafted = %d\n", n_drafted);
|
LOG_INF("n_drafted = %d\n", n_drafted);
|
||||||
LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
|
LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
|
||||||
LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
|
LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
|
||||||
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
|
t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
|
||||||
LOG_TEE("n_accept = %d\n", n_accept);
|
LOG_INF("n_accept = %d\n", n_accept);
|
||||||
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||||
|
|
||||||
LOG_TEE("\ntarget:\n\n");
|
LOG_INF("\ntarget:\n\n");
|
||||||
gpt_perf_print(ctx, smpl);
|
gpt_perf_print(ctx, smpl);
|
||||||
|
|
||||||
gpt_sampler_free(smpl);
|
gpt_sampler_free(smpl);
|
||||||
@ -251,7 +248,7 @@ int main(int argc, char ** argv){
|
|||||||
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
fprintf(stderr, "\n\n");
|
LOG("\n\n");
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -1,12 +1,11 @@
|
|||||||
#include "arg.h"
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "console.h"
|
#include "console.h"
|
||||||
|
#include "log.h"
|
||||||
#include "sampling.h"
|
#include "sampling.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <cinttypes>
|
|
||||||
#include <cmath>
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
@ -42,11 +41,13 @@ static std::vector<llama_token> * g_output_tokens;
|
|||||||
static bool is_interacting = false;
|
static bool is_interacting = false;
|
||||||
static bool need_insert_eot = false;
|
static bool need_insert_eot = false;
|
||||||
|
|
||||||
static void print_usage(int, char ** argv) {
|
static void print_usage(int argc, char ** argv) {
|
||||||
printf("\nexample usage:\n");
|
(void) argc;
|
||||||
printf("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
|
|
||||||
printf("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
|
LOG("\nexample usage:\n");
|
||||||
printf("\n");
|
LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
|
||||||
|
LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
|
||||||
|
LOG("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool file_exists(const std::string & path) {
|
static bool file_exists(const std::string & path) {
|
||||||
@ -74,8 +75,7 @@ static void write_logfile(
|
|||||||
|
|
||||||
const bool success = fs_create_directory_with_parents(params.logdir);
|
const bool success = fs_create_directory_with_parents(params.logdir);
|
||||||
if (!success) {
|
if (!success) {
|
||||||
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
LOG_ERR("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str());
|
||||||
__func__, params.logdir.c_str());
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -83,7 +83,7 @@ static void write_logfile(
|
|||||||
FILE * logfile = fopen(logfile_path.c_str(), "w");
|
FILE * logfile = fopen(logfile_path.c_str(), "w");
|
||||||
|
|
||||||
if (logfile == NULL) {
|
if (logfile == NULL) {
|
||||||
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -113,7 +113,7 @@ static void sigint_handler(int signo) {
|
|||||||
need_insert_eot = true;
|
need_insert_eot = true;
|
||||||
} else {
|
} else {
|
||||||
console::cleanup();
|
console::cleanup();
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
gpt_perf_print(*g_ctx, *g_smpl);
|
gpt_perf_print(*g_ctx, *g_smpl);
|
||||||
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
|
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
|
||||||
_exit(130);
|
_exit(130);
|
||||||
@ -122,17 +122,11 @@ static void sigint_handler(int signo) {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
|
static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
|
||||||
(void) level;
|
|
||||||
(void) user_data;
|
|
||||||
LOG_TEE("%s", text);
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
|
|
||||||
llama_chat_msg new_msg{role, content};
|
llama_chat_msg new_msg{role, content};
|
||||||
auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
|
auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
|
||||||
chat_msgs.push_back({role, content});
|
chat_msgs.push_back({role, content});
|
||||||
LOG("formatted: %s\n", formatted.c_str());
|
LOG_DBG("formatted: '%s'\n", formatted.c_str());
|
||||||
return formatted;
|
return formatted;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -143,55 +137,46 @@ int main(int argc, char ** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
gpt_init();
|
||||||
|
|
||||||
auto & sparams = params.sparams;
|
auto & sparams = params.sparams;
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
|
||||||
log_set_target(log_filename_generator("main", "log"));
|
|
||||||
LOG_TEE("Log start\n");
|
|
||||||
log_dump_cmdline(argc, argv);
|
|
||||||
llama_log_set(llama_log_callback_logTee, nullptr);
|
|
||||||
#endif // LOG_DISABLE_LOGS
|
|
||||||
|
|
||||||
// TODO: Dump params ?
|
|
||||||
//LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
|
|
||||||
|
|
||||||
// save choice to use color for later
|
// save choice to use color for later
|
||||||
// (note for later: this is a slightly awkward choice)
|
// (note for later: this is a slightly awkward choice)
|
||||||
console::init(params.simple_io, params.use_color);
|
console::init(params.simple_io, params.use_color);
|
||||||
atexit([]() { console::cleanup(); });
|
atexit([]() { console::cleanup(); });
|
||||||
|
|
||||||
if (params.logits_all) {
|
if (params.logits_all) {
|
||||||
printf("\n************\n");
|
LOG_ERR("************\n");
|
||||||
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
|
LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
|
||||||
printf("************\n\n");
|
LOG_ERR("************\n\n");
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.embedding) {
|
if (params.embedding) {
|
||||||
printf("\n************\n");
|
LOG_ERR("************\n");
|
||||||
printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
|
LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
|
||||||
printf("************\n\n");
|
LOG_ERR("************\n\n");
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.n_ctx != 0 && params.n_ctx < 8) {
|
if (params.n_ctx != 0 && params.n_ctx < 8) {
|
||||||
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
||||||
params.n_ctx = 8;
|
params.n_ctx = 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.rope_freq_base != 0.0) {
|
if (params.rope_freq_base != 0.0) {
|
||||||
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
|
LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.rope_freq_scale != 0.0) {
|
if (params.rope_freq_scale != 0.0) {
|
||||||
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
||||||
}
|
}
|
||||||
|
|
||||||
print_build_info();
|
LOG_INF("%s: llama backend init\n", __func__);
|
||||||
|
|
||||||
LOG("%s: llama backend init\n", __func__);
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
|
|
||||||
@ -206,21 +191,19 @@ int main(int argc, char ** argv) {
|
|||||||
g_smpl = &smpl;
|
g_smpl = &smpl;
|
||||||
|
|
||||||
// load the model and apply lora adapter, if any
|
// load the model and apply lora adapter, if any
|
||||||
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||||
llama_init_result llama_init = llama_init_from_gpt_params(params);
|
llama_init_result llama_init = llama_init_from_gpt_params(params);
|
||||||
|
|
||||||
model = llama_init.model;
|
model = llama_init.model;
|
||||||
ctx = llama_init.context;
|
ctx = llama_init.context;
|
||||||
|
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
LOG_TEE("%s: error: unable to load model\n", __func__);
|
LOG_ERR("%s: error: unable to load model\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("%s: llama threadpool init = n_threads = %d\n",
|
LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
|
||||||
__func__,
|
|
||||||
(int) params.cpuparams.n_threads
|
|
||||||
);
|
|
||||||
struct ggml_threadpool_params tpp_batch =
|
struct ggml_threadpool_params tpp_batch =
|
||||||
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
|
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
|
||||||
struct ggml_threadpool_params tpp =
|
struct ggml_threadpool_params tpp =
|
||||||
@ -232,8 +215,8 @@ int main(int argc, char ** argv) {
|
|||||||
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
|
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
|
||||||
threadpool_batch = ggml_threadpool_new(&tpp_batch);
|
threadpool_batch = ggml_threadpool_new(&tpp_batch);
|
||||||
if (!threadpool_batch) {
|
if (!threadpool_batch) {
|
||||||
LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
|
LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
|
||||||
exit(1);
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Start the non-batch threadpool in the paused state
|
// Start the non-batch threadpool in the paused state
|
||||||
@ -242,55 +225,54 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
|
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
|
||||||
if (!threadpool) {
|
if (!threadpool) {
|
||||||
LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
|
LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
|
||||||
exit(1);
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_attach_threadpool(ctx, threadpool, threadpool_batch);
|
llama_attach_threadpool(ctx, threadpool, threadpool_batch);
|
||||||
|
|
||||||
const int n_ctx_train = llama_n_ctx_train(model);
|
const int n_ctx_train = llama_n_ctx_train(model);
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
LOG("n_ctx: %d\n", n_ctx);
|
|
||||||
|
|
||||||
if (n_ctx > n_ctx_train) {
|
if (n_ctx > n_ctx_train) {
|
||||||
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
|
||||||
__func__, n_ctx_train, n_ctx);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// print chat template example in conversation mode
|
// print chat template example in conversation mode
|
||||||
if (params.conversation) {
|
if (params.conversation) {
|
||||||
if (params.enable_chat_template) {
|
if (params.enable_chat_template) {
|
||||||
LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
|
LOG_INF("%s: chat template example:\n%s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
|
||||||
} else {
|
} else {
|
||||||
LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
|
LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
|
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
|
LOG_INF("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string path_session = params.path_prompt_cache;
|
std::string path_session = params.path_prompt_cache;
|
||||||
std::vector<llama_token> session_tokens;
|
std::vector<llama_token> session_tokens;
|
||||||
|
|
||||||
if (!path_session.empty()) {
|
if (!path_session.empty()) {
|
||||||
LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
|
LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
|
||||||
if (!file_exists(path_session)) {
|
if (!file_exists(path_session)) {
|
||||||
LOG_TEE("%s: session file does not exist, will create.\n", __func__);
|
LOG_INF("%s: session file does not exist, will create.\n", __func__);
|
||||||
} else if (file_is_empty(path_session)) {
|
} else if (file_is_empty(path_session)) {
|
||||||
LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__);
|
LOG_INF("%s: The session file is empty. A new session will be initialized.\n", __func__);
|
||||||
} else {
|
} else {
|
||||||
// The file exists and is not empty
|
// The file exists and is not empty
|
||||||
session_tokens.resize(n_ctx);
|
session_tokens.resize(n_ctx);
|
||||||
size_t n_token_count_out = 0;
|
size_t n_token_count_out = 0;
|
||||||
if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
|
if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
|
||||||
LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
|
LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str());
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
session_tokens.resize(n_token_count_out);
|
session_tokens.resize(n_token_count_out);
|
||||||
LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
|
LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -298,7 +280,8 @@ int main(int argc, char ** argv) {
|
|||||||
if (!llama_model_has_encoder(model)) {
|
if (!llama_model_has_encoder(model)) {
|
||||||
GGML_ASSERT(!llama_add_eos_token(model));
|
GGML_ASSERT(!llama_add_eos_token(model));
|
||||||
}
|
}
|
||||||
LOG("add_bos: %d\n", add_bos);
|
|
||||||
|
LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos);
|
||||||
|
|
||||||
std::vector<llama_token> embd_inp;
|
std::vector<llama_token> embd_inp;
|
||||||
|
|
||||||
@ -307,31 +290,31 @@ int main(int argc, char ** argv) {
|
|||||||
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
|
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
|
||||||
: params.prompt;
|
: params.prompt;
|
||||||
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
|
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
|
||||||
LOG("tokenize the prompt\n");
|
LOG_DBG("tokenize the prompt\n");
|
||||||
embd_inp = ::llama_tokenize(ctx, prompt, true, true);
|
embd_inp = ::llama_tokenize(ctx, prompt, true, true);
|
||||||
} else {
|
} else {
|
||||||
LOG("use session tokens\n");
|
LOG_DBG("use session tokens\n");
|
||||||
embd_inp = session_tokens;
|
embd_inp = session_tokens;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("prompt: \"%s\"\n", log_tostr(prompt));
|
LOG_DBG("prompt: \"%s\"\n", prompt.c_str());
|
||||||
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
// Should not run without any tokens
|
// Should not run without any tokens
|
||||||
if (embd_inp.empty()) {
|
if (embd_inp.empty()) {
|
||||||
if (add_bos) {
|
if (add_bos) {
|
||||||
embd_inp.push_back(llama_token_bos(model));
|
embd_inp.push_back(llama_token_bos(model));
|
||||||
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
|
||||||
} else {
|
} else {
|
||||||
LOG_TEE("error: input is empty\n");
|
LOG_ERR("input is empty\n");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Tokenize negative prompt
|
// Tokenize negative prompt
|
||||||
if ((int) embd_inp.size() > n_ctx - 4) {
|
if ((int) embd_inp.size() > n_ctx - 4) {
|
||||||
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -345,29 +328,28 @@ int main(int argc, char ** argv) {
|
|||||||
n_matching_session_tokens++;
|
n_matching_session_tokens++;
|
||||||
}
|
}
|
||||||
if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
|
if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
|
||||||
LOG_TEE("%s: using full prompt from session file\n", __func__);
|
LOG_INF("%s: using full prompt from session file\n", __func__);
|
||||||
} else if (n_matching_session_tokens >= embd_inp.size()) {
|
} else if (n_matching_session_tokens >= embd_inp.size()) {
|
||||||
LOG_TEE("%s: session file has exact match for prompt!\n", __func__);
|
LOG_INF("%s: session file has exact match for prompt!\n", __func__);
|
||||||
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
|
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
|
||||||
LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
|
LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
|
||||||
__func__, n_matching_session_tokens, embd_inp.size());
|
__func__, n_matching_session_tokens, embd_inp.size());
|
||||||
} else {
|
} else {
|
||||||
LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
|
LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n",
|
||||||
__func__, n_matching_session_tokens, embd_inp.size());
|
__func__, n_matching_session_tokens, embd_inp.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
// remove any "future" tokens that we might have inherited from the previous session
|
// remove any "future" tokens that we might have inherited from the previous session
|
||||||
llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
|
llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
LOGLN(
|
LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
|
||||||
"recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu",
|
embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
|
||||||
log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
|
|
||||||
|
|
||||||
// if we will use the cache for the full prompt without reaching the end of the cache, force
|
// if we will use the cache for the full prompt without reaching the end of the cache, force
|
||||||
// reevaluation of the last token to recalculate the cached logits
|
// reevaluation of the last token to recalculate the cached logits
|
||||||
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
|
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
|
||||||
LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1);
|
LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1);
|
||||||
|
|
||||||
session_tokens.resize(embd_inp.size() - 1);
|
session_tokens.resize(embd_inp.size() - 1);
|
||||||
}
|
}
|
||||||
@ -389,21 +371,20 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (params.verbose_prompt) {
|
if (params.verbose_prompt) {
|
||||||
LOG_TEE("\n");
|
LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||||
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||||
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
|
||||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||||
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.n_keep > add_bos) {
|
if (params.n_keep > add_bos) {
|
||||||
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
|
LOG_INF("%s: static prompt based on n_keep: '", __func__);
|
||||||
for (int i = 0; i < params.n_keep; i++) {
|
for (int i = 0; i < params.n_keep; i++) {
|
||||||
LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||||
}
|
}
|
||||||
LOG_TEE("'\n");
|
LOG("'\n");
|
||||||
}
|
}
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
// ctrl+C handling
|
// ctrl+C handling
|
||||||
@ -423,40 +404,40 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
LOG_TEE("%s: interactive mode on.\n", __func__);
|
LOG("%s: interactive mode on.\n", __func__);
|
||||||
|
|
||||||
if (!params.antiprompt.empty()) {
|
if (!params.antiprompt.empty()) {
|
||||||
for (const auto & antiprompt : params.antiprompt) {
|
for (const auto & antiprompt : params.antiprompt) {
|
||||||
LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
|
LOG("Reverse prompt: '%s'\n", antiprompt.c_str());
|
||||||
if (params.verbose_prompt) {
|
if (params.verbose_prompt) {
|
||||||
auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
|
auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
|
||||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
|
LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.input_prefix_bos) {
|
if (params.input_prefix_bos) {
|
||||||
LOG_TEE("Input prefix with BOS\n");
|
LOG("Input prefix with BOS\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!params.input_prefix.empty()) {
|
if (!params.input_prefix.empty()) {
|
||||||
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
|
LOG("Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||||
if (params.verbose_prompt) {
|
if (params.verbose_prompt) {
|
||||||
auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
|
auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
|
||||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
|
LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!params.input_suffix.empty()) {
|
if (!params.input_suffix.empty()) {
|
||||||
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
|
LOG("Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||||
if (params.verbose_prompt) {
|
if (params.verbose_prompt) {
|
||||||
auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
|
auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
|
||||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
|
LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -464,15 +445,15 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
smpl = gpt_sampler_init(model, sparams);
|
smpl = gpt_sampler_init(model, sparams);
|
||||||
if (!smpl) {
|
if (!smpl) {
|
||||||
fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
|
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
|
||||||
exit(1);
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
|
LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
|
||||||
LOG_TEE("sampling params: \n%s\n", sparams.print().c_str());
|
LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
|
||||||
LOG_TEE("sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str());
|
LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
|
||||||
|
|
||||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||||
|
|
||||||
// group-attention state
|
// group-attention state
|
||||||
// number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
|
// number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
|
||||||
@ -486,9 +467,9 @@ int main(int argc, char ** argv) {
|
|||||||
GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
|
GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
|
||||||
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
|
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
|
||||||
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
|
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
|
||||||
LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
|
LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
|
||||||
}
|
}
|
||||||
LOG_TEE("\n\n");
|
LOG("\n");
|
||||||
|
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
const char * control_message;
|
const char * control_message;
|
||||||
@ -500,11 +481,11 @@ int main(int argc, char ** argv) {
|
|||||||
" - To return control without starting a new line, end your input with '/'.\n"
|
" - To return control without starting a new line, end your input with '/'.\n"
|
||||||
" - If you want to submit another line, end your input with '\\'.\n";
|
" - If you want to submit another line, end your input with '\\'.\n";
|
||||||
}
|
}
|
||||||
LOG_TEE("== Running in interactive mode. ==\n");
|
LOG("== Running in interactive mode. ==\n");
|
||||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||||
LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
|
LOG( " - Press Ctrl+C to interject at any time.\n");
|
||||||
#endif
|
#endif
|
||||||
LOG_TEE( "%s\n", control_message);
|
LOG( "%s\n", control_message);
|
||||||
|
|
||||||
is_interacting = params.interactive_first;
|
is_interacting = params.interactive_first;
|
||||||
}
|
}
|
||||||
@ -543,7 +524,7 @@ int main(int argc, char ** argv) {
|
|||||||
llama_token * enc_input_buf = embd_inp.data();
|
llama_token * enc_input_buf = embd_inp.data();
|
||||||
|
|
||||||
if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
|
if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
|
||||||
LOG_TEE("%s : failed to eval\n", __func__);
|
LOG_ERR("%s : failed to eval\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -569,9 +550,8 @@ int main(int argc, char ** argv) {
|
|||||||
embd.resize(max_embd_size);
|
embd.resize(max_embd_size);
|
||||||
|
|
||||||
console::set_display(console::error);
|
console::set_display(console::error);
|
||||||
printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
||||||
console::set_display(console::reset);
|
console::set_display(console::reset);
|
||||||
fflush(stdout);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ga_n == 1) {
|
if (ga_n == 1) {
|
||||||
@ -581,14 +561,14 @@ int main(int argc, char ** argv) {
|
|||||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
||||||
if (n_past + (int) embd.size() >= n_ctx) {
|
if (n_past + (int) embd.size() >= n_ctx) {
|
||||||
if (params.n_predict == -2) {
|
if (params.n_predict == -2) {
|
||||||
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int n_left = n_past - params.n_keep;
|
const int n_left = n_past - params.n_keep;
|
||||||
const int n_discard = n_left/2;
|
const int n_discard = n_left/2;
|
||||||
|
|
||||||
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
n_past, n_left, n_ctx, params.n_keep, n_discard);
|
||||||
|
|
||||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
|
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
|
||||||
@ -596,11 +576,11 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
n_past -= n_discard;
|
n_past -= n_discard;
|
||||||
|
|
||||||
LOG("after swap: n_past = %d\n", n_past);
|
LOG_DBG("after swap: n_past = %d\n", n_past);
|
||||||
|
|
||||||
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
|
||||||
|
|
||||||
LOG("clear session path\n");
|
LOG_DBG("clear session path\n");
|
||||||
path_session.clear();
|
path_session.clear();
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -610,10 +590,10 @@ int main(int argc, char ** argv) {
|
|||||||
const int bd = (ga_w/ga_n)*(ga_n - 1);
|
const int bd = (ga_w/ga_n)*(ga_n - 1);
|
||||||
const int dd = (ga_w/ga_n) - ib*bd - ga_w;
|
const int dd = (ga_w/ga_n) - ib*bd - ga_w;
|
||||||
|
|
||||||
LOG("\n");
|
LOG_DBG("\n");
|
||||||
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
|
LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
|
||||||
LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
|
LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
|
||||||
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
|
LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
|
||||||
|
|
||||||
llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
|
llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
|
||||||
llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
|
llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
|
||||||
@ -623,7 +603,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
ga_i += ga_w/ga_n;
|
ga_i += ga_w/ga_n;
|
||||||
|
|
||||||
LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
|
LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -655,19 +635,19 @@ int main(int argc, char ** argv) {
|
|||||||
n_eval = params.n_batch;
|
n_eval = params.n_batch;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
|
||||||
|
|
||||||
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
|
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
|
||||||
LOG_TEE("%s : failed to eval\n", __func__);
|
LOG_ERR("%s : failed to eval\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
n_past += n_eval;
|
n_past += n_eval;
|
||||||
|
|
||||||
LOG("n_past = %d\n", n_past);
|
LOG_DBG("n_past = %d\n", n_past);
|
||||||
// Display total tokens alongside total time
|
// Display total tokens alongside total time
|
||||||
if (params.n_print > 0 && n_past % params.n_print == 0) {
|
if (params.n_print > 0 && n_past % params.n_print == 0) {
|
||||||
LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
|
LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -685,14 +665,14 @@ int main(int argc, char ** argv) {
|
|||||||
need_to_save_session = false;
|
need_to_save_session = false;
|
||||||
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
||||||
|
|
||||||
LOG("saved session to %s\n", path_session.c_str());
|
LOG_DBG("saved session to %s\n", path_session.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
|
const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
|
||||||
|
|
||||||
gpt_sampler_accept(smpl, id, /* apply_grammar= */ true);
|
gpt_sampler_accept(smpl, id, /* accept_grammar= */ true);
|
||||||
|
|
||||||
// LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str());
|
// LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
|
||||||
|
|
||||||
embd.push_back(id);
|
embd.push_back(id);
|
||||||
|
|
||||||
@ -702,16 +682,16 @@ int main(int argc, char ** argv) {
|
|||||||
// decrement remaining sampling budget
|
// decrement remaining sampling budget
|
||||||
--n_remain;
|
--n_remain;
|
||||||
|
|
||||||
LOG("n_remain: %d\n", n_remain);
|
LOG_DBG("n_remain: %d\n", n_remain);
|
||||||
} else {
|
} else {
|
||||||
// some user input remains from prompt or interaction, forward it to processing
|
// some user input remains from prompt or interaction, forward it to processing
|
||||||
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
||||||
while ((int) embd_inp.size() > n_consumed) {
|
while ((int) embd_inp.size() > n_consumed) {
|
||||||
embd.push_back(embd_inp[n_consumed]);
|
embd.push_back(embd_inp[n_consumed]);
|
||||||
|
|
||||||
// push the prompt in the sampling context in order to apply repetition penalties later
|
// push the prompt in the sampling context in order to apply repetition penalties later
|
||||||
// for the prompt, we don't apply grammar rules
|
// for the prompt, we don't apply grammar rules
|
||||||
gpt_sampler_accept(smpl, embd_inp[n_consumed], /* apply_grammar= */ false);
|
gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
|
||||||
|
|
||||||
++n_consumed;
|
++n_consumed;
|
||||||
if ((int) embd.size() >= params.n_batch) {
|
if ((int) embd.size() >= params.n_batch) {
|
||||||
@ -726,7 +706,7 @@ int main(int argc, char ** argv) {
|
|||||||
const std::string token_str = llama_token_to_piece(ctx, id, params.special);
|
const std::string token_str = llama_token_to_piece(ctx, id, params.special);
|
||||||
|
|
||||||
// Console/Stream Output
|
// Console/Stream Output
|
||||||
fprintf(stdout, "%s", token_str.c_str());
|
LOG("%s", token_str.c_str());
|
||||||
|
|
||||||
// Record Displayed Tokens To Log
|
// Record Displayed Tokens To Log
|
||||||
// Note: Generated tokens are created one by one hence this check
|
// Note: Generated tokens are created one by one hence this check
|
||||||
@ -738,8 +718,6 @@ int main(int argc, char ** argv) {
|
|||||||
output_tokens.push_back(id);
|
output_tokens.push_back(id);
|
||||||
output_ss << token_str;
|
output_ss << token_str;
|
||||||
}
|
}
|
||||||
|
|
||||||
fflush(stdout);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -788,13 +766,13 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (is_antiprompt) {
|
if (is_antiprompt) {
|
||||||
LOG("found antiprompt: %s\n", last_output.c_str());
|
LOG_DBG("found antiprompt: %s\n", last_output.c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// deal with end of generation tokens in interactive mode
|
// deal with end of generation tokens in interactive mode
|
||||||
if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
|
if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
|
||||||
LOG("found an EOG token\n");
|
LOG_DBG("found an EOG token\n");
|
||||||
|
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
if (!params.antiprompt.empty()) {
|
if (!params.antiprompt.empty()) {
|
||||||
@ -808,7 +786,7 @@ int main(int argc, char ** argv) {
|
|||||||
chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
|
chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
|
||||||
}
|
}
|
||||||
is_interacting = true;
|
is_interacting = true;
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -819,21 +797,21 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (n_past > 0 && is_interacting) {
|
if (n_past > 0 && is_interacting) {
|
||||||
LOG("waiting for user input\n");
|
LOG_DBG("waiting for user input\n");
|
||||||
|
|
||||||
if (params.conversation) {
|
if (params.conversation) {
|
||||||
printf("\n> ");
|
LOG("\n> ");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.input_prefix_bos) {
|
if (params.input_prefix_bos) {
|
||||||
LOG("adding input prefix BOS token\n");
|
LOG_DBG("adding input prefix BOS token\n");
|
||||||
embd_inp.push_back(llama_token_bos(model));
|
embd_inp.push_back(llama_token_bos(model));
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string buffer;
|
std::string buffer;
|
||||||
if (!params.input_prefix.empty() && !params.conversation) {
|
if (!params.input_prefix.empty() && !params.conversation) {
|
||||||
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
||||||
printf("%s", params.input_prefix.c_str());
|
LOG("%s", params.input_prefix.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
// color user input only
|
// color user input only
|
||||||
@ -856,11 +834,11 @@ int main(int argc, char ** argv) {
|
|||||||
if (buffer.length() > 1) {
|
if (buffer.length() > 1) {
|
||||||
// append input suffix if any
|
// append input suffix if any
|
||||||
if (!params.input_suffix.empty() && !params.conversation) {
|
if (!params.input_suffix.empty() && !params.conversation) {
|
||||||
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
||||||
printf("%s", params.input_suffix.c_str());
|
LOG("%s", params.input_suffix.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("buffer: '%s'\n", buffer.c_str());
|
LOG_DBG("buffer: '%s'\n", buffer.c_str());
|
||||||
|
|
||||||
const size_t original_size = embd_inp.size();
|
const size_t original_size = embd_inp.size();
|
||||||
|
|
||||||
@ -877,7 +855,7 @@ int main(int argc, char ** argv) {
|
|||||||
const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
|
const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
|
||||||
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
|
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
|
||||||
|
|
||||||
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
|
LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
|
||||||
|
|
||||||
// if user stop generation mid-way, we must add EOT to finish model's last response
|
// if user stop generation mid-way, we must add EOT to finish model's last response
|
||||||
if (need_insert_eot && format_chat) {
|
if (need_insert_eot && format_chat) {
|
||||||
@ -900,9 +878,9 @@ int main(int argc, char ** argv) {
|
|||||||
assistant_ss.str("");
|
assistant_ss.str("");
|
||||||
|
|
||||||
n_remain -= line_inp.size();
|
n_remain -= line_inp.size();
|
||||||
LOG("n_remain: %d\n", n_remain);
|
LOG_DBG("n_remain: %d\n", n_remain);
|
||||||
} else {
|
} else {
|
||||||
LOG("empty line, passing control back\n");
|
LOG_DBG("empty line, passing control back\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
input_echo = false; // do not echo this again
|
input_echo = false; // do not echo this again
|
||||||
@ -918,7 +896,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// end of generation
|
// end of generation
|
||||||
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
|
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
|
||||||
LOG_TEE(" [end of text]\n");
|
LOG(" [end of text]\n");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -931,11 +909,11 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
|
if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
|
||||||
LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
|
LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
|
||||||
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG("\n\n");
|
||||||
gpt_perf_print(ctx, smpl);
|
gpt_perf_print(ctx, smpl);
|
||||||
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
|
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
|
||||||
|
|
||||||
@ -949,9 +927,5 @@ int main(int argc, char ** argv) {
|
|||||||
ggml_threadpool_free(threadpool);
|
ggml_threadpool_free(threadpool);
|
||||||
ggml_threadpool_free(threadpool_batch);
|
ggml_threadpool_free(threadpool_batch);
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
|
||||||
LOG_TEE("Log end\n");
|
|
||||||
#endif // LOG_DISABLE_LOGS
|
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -4,6 +4,7 @@
|
|||||||
#include "arg.h"
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "sampling.h"
|
#include "sampling.h"
|
||||||
|
#include "log.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
@ -83,7 +84,9 @@ static void print_date_time() {
|
|||||||
char buffer[80];
|
char buffer[80];
|
||||||
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time);
|
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time);
|
||||||
|
|
||||||
printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer);
|
LOG_INF("\n");
|
||||||
|
LOG_INF("\033[35mrun parameters as of %s\033[0m\n", buffer);
|
||||||
|
LOG_INF("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Define a split string function to ...
|
// Define a split string function to ...
|
||||||
@ -106,6 +109,8 @@ int main(int argc, char ** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
gpt_init();
|
||||||
|
|
||||||
// number of simultaneous "clients" to simulate
|
// number of simultaneous "clients" to simulate
|
||||||
const int32_t n_clients = params.n_parallel;
|
const int32_t n_clients = params.n_parallel;
|
||||||
|
|
||||||
@ -120,12 +125,6 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
const bool dump_kv_cache = params.dump_kv_cache;
|
const bool dump_kv_cache = params.dump_kv_cache;
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
|
||||||
log_set_target(log_filename_generator("parallel", "log"));
|
|
||||||
LOG_TEE("Log start\n");
|
|
||||||
log_dump_cmdline(argc, argv);
|
|
||||||
#endif // LOG_DISABLE_LOGS
|
|
||||||
|
|
||||||
// init llama.cpp
|
// init llama.cpp
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
@ -138,23 +137,22 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// load the prompts from an external file if there are any
|
// load the prompts from an external file if there are any
|
||||||
if (params.prompt.empty()) {
|
if (params.prompt.empty()) {
|
||||||
printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
|
LOG_INF("\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
|
||||||
} else {
|
} else {
|
||||||
// Output each line of the input params.prompts vector and copy to k_prompts
|
// Output each line of the input params.prompts vector and copy to k_prompts
|
||||||
int index = 0;
|
int index = 0;
|
||||||
printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
|
LOG_INF("\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
|
||||||
|
|
||||||
std::vector<std::string> prompts = split_string(params.prompt, '\n');
|
std::vector<std::string> prompts = split_string(params.prompt, '\n');
|
||||||
for (const auto& prompt : prompts) {
|
for (const auto& prompt : prompts) {
|
||||||
k_prompts.resize(index + 1);
|
k_prompts.resize(index + 1);
|
||||||
k_prompts[index] = prompt;
|
k_prompts[index] = prompt;
|
||||||
index++;
|
index++;
|
||||||
printf("%3d prompt: %s\n", index, prompt.c_str());
|
LOG_INF("%3d prompt: %s\n", index, prompt.c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, "\n\n");
|
LOG_INF("\n\n");
|
||||||
fflush(stderr);
|
|
||||||
|
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
@ -183,19 +181,19 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
const auto t_main_start = ggml_time_us();
|
const auto t_main_start = ggml_time_us();
|
||||||
|
|
||||||
LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
|
LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
|
||||||
LOG_TEE("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
|
LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
|
|
||||||
{
|
{
|
||||||
LOG_TEE("%s: Evaluating the system prompt ...\n", __func__);
|
LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
|
||||||
|
|
||||||
for (int32_t i = 0; i < n_tokens_system; ++i) {
|
for (int32_t i = 0; i < n_tokens_system; ++i) {
|
||||||
llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
|
llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (llama_decode(ctx, batch) != 0) {
|
if (llama_decode(ctx, batch) != 0) {
|
||||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -204,10 +202,10 @@ int main(int argc, char ** argv) {
|
|||||||
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
|
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("Processing requests ...\n\n");
|
LOG_INF("Processing requests ...\n\n");
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
if (dump_kv_cache) {
|
if (dump_kv_cache) {
|
||||||
@ -238,7 +236,7 @@ int main(int argc, char ** argv) {
|
|||||||
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
|
llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("%s: clearing the KV cache\n", __func__);
|
LOG_INF("%s: clearing the KV cache\n", __func__);
|
||||||
}
|
}
|
||||||
|
|
||||||
// insert new sequences for decoding
|
// insert new sequences for decoding
|
||||||
@ -273,7 +271,7 @@ int main(int argc, char ** argv) {
|
|||||||
client.n_decoded = 0;
|
client.n_decoded = 0;
|
||||||
client.i_batch = batch.n_tokens - 1;
|
client.i_batch = batch.n_tokens - 1;
|
||||||
|
|
||||||
LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
|
LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
|
||||||
|
|
||||||
g_seq_id += 1;
|
g_seq_id += 1;
|
||||||
|
|
||||||
@ -317,11 +315,11 @@ int main(int argc, char ** argv) {
|
|||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
if (n_batch == 1 || ret < 0) {
|
if (n_batch == 1 || ret < 0) {
|
||||||
// if you get here, it means the KV cache is full - try increasing it via the context size
|
// if you get here, it means the KV cache is full - try increasing it via the context size
|
||||||
LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
|
LOG_ERR("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
|
LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
|
||||||
|
|
||||||
n_cache_miss += 1;
|
n_cache_miss += 1;
|
||||||
|
|
||||||
@ -332,7 +330,7 @@ int main(int argc, char ** argv) {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
|
LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens);
|
||||||
|
|
||||||
for (auto & client : clients) {
|
for (auto & client : clients) {
|
||||||
if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
|
if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) {
|
||||||
@ -377,7 +375,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
const auto t_main_end = ggml_time_us();
|
const auto t_main_end = ggml_time_us();
|
||||||
|
|
||||||
LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput: %s\n\033[35mResponse: %s\033[0m\n\n",
|
LOG_INF("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\n\033[35mResponse: %s\033[0m\n\n",
|
||||||
client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded,
|
client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded,
|
||||||
(t_main_end - client.t_start_prompt) / 1e6,
|
(t_main_end - client.t_start_prompt) / 1e6,
|
||||||
(double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
|
(double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
|
||||||
@ -400,19 +398,19 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
print_date_time();
|
print_date_time();
|
||||||
|
|
||||||
LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
|
LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
|
||||||
if (params.prompt_file.empty()) {
|
if (params.prompt_file.empty()) {
|
||||||
params.prompt_file = "used built-in defaults";
|
params.prompt_file = "used built-in defaults";
|
||||||
}
|
}
|
||||||
LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
|
LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
|
||||||
LOG_TEE("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
|
LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
|
||||||
|
|
||||||
LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
|
LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
|
||||||
LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
|
LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
|
||||||
LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
|
LOG_INF("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
|
||||||
LOG_TEE("Cache misses: %6d\n", n_cache_miss);
|
LOG_INF("Cache misses: %6d\n", n_cache_miss);
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
|
|
||||||
// TODO: print sampling/grammar timings for all clients
|
// TODO: print sampling/grammar timings for all clients
|
||||||
llama_perf_context_print(ctx);
|
llama_perf_context_print(ctx);
|
||||||
@ -424,7 +422,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
fprintf(stderr, "\n\n");
|
LOG("\n\n");
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
#include "arg.h"
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "log.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
@ -8,9 +9,9 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
static void print_usage(int, char ** argv) {
|
static void print_usage(int, char ** argv) {
|
||||||
LOG_TEE("\nexample usage:\n");
|
LOG("\nexample usage:\n");
|
||||||
LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
|
LOG("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
@ -24,6 +25,8 @@ int main(int argc, char ** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
gpt_init();
|
||||||
|
|
||||||
int n_junk = params.n_junk;
|
int n_junk = params.n_junk;
|
||||||
int n_keep = params.n_keep;
|
int n_keep = params.n_keep;
|
||||||
int n_grp = params.grp_attn_n;
|
int n_grp = params.grp_attn_n;
|
||||||
@ -63,7 +66,7 @@ int main(int argc, char ** argv) {
|
|||||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
||||||
|
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -77,7 +80,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||||
if (ctx == NULL) {
|
if (ctx == NULL) {
|
||||||
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
|
LOG_ERR("%s: failed to create the llama_context\n" , __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -107,14 +110,14 @@ int main(int argc, char ** argv) {
|
|||||||
const int n_batch = ctx_params.n_batch;
|
const int n_batch = ctx_params.n_batch;
|
||||||
const int n_batch_grp = ctx_params.n_batch/n_grp;
|
const int n_batch_grp = ctx_params.n_batch/n_grp;
|
||||||
|
|
||||||
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
|
LOG_INF("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
|
||||||
|
|
||||||
// print the prompt token-by-token
|
// print the prompt token-by-token
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
LOG_TEE("prefix tokens: %d\n", n_tokens_prefix);
|
LOG_INF("prefix tokens: %d\n", n_tokens_prefix);
|
||||||
LOG_TEE("prompt tokens: %d\n", n_tokens_all);
|
LOG_INF("prompt tokens: %d\n", n_tokens_all);
|
||||||
//LOG_TEE("prompt: %s\n", params.prompt.c_str());
|
//LOG_INF("prompt: %s\n", params.prompt.c_str());
|
||||||
|
|
||||||
llama_batch batch = llama_batch_init(params.n_batch, 0, 1);
|
llama_batch batch = llama_batch_init(params.n_batch, 0, 1);
|
||||||
|
|
||||||
@ -145,11 +148,11 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (llama_decode(ctx, batch) != 0) {
|
if (llama_decode(ctx, batch) != 0) {
|
||||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
LOG_INF("%s: llama_decode() failed\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
|
LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
|
||||||
|
|
||||||
if (i + n_batch >= n_tokens_all) {
|
if (i + n_batch >= n_tokens_all) {
|
||||||
break;
|
break;
|
||||||
@ -159,7 +162,7 @@ int main(int argc, char ** argv) {
|
|||||||
for (int i = n_ctx; i < n_tokens_all; i += n_batch) {
|
for (int i = n_ctx; i < n_tokens_all; i += n_batch) {
|
||||||
const int n_discard = n_batch;
|
const int n_discard = n_batch;
|
||||||
|
|
||||||
LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard);
|
LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);
|
||||||
|
|
||||||
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
|
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
|
||||||
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
|
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
|
||||||
@ -179,18 +182,18 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (llama_decode(ctx, batch) != 0) {
|
if (llama_decode(ctx, batch) != 0) {
|
||||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
|
LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
const int n_discard = n_past - n_ctx + n_predict;
|
const int n_discard = n_past - n_ctx + n_predict;
|
||||||
|
|
||||||
if (n_discard > 0) {
|
if (n_discard > 0) {
|
||||||
LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
|
LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
|
||||||
|
|
||||||
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
|
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
|
||||||
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
|
llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
|
||||||
@ -201,17 +204,16 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
LOG_TEE("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
|
LOG_INF("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
|
|
||||||
// main loop
|
// main loop
|
||||||
|
|
||||||
int n_cur = n_tokens_all;
|
int n_cur = n_tokens_all;
|
||||||
int n_decode = 0;
|
int n_decode = 0;
|
||||||
|
|
||||||
LOG_TEE("%s", prompt_suffix.c_str());
|
LOG_INF("%s", prompt_suffix.c_str());
|
||||||
fflush(stdout);
|
|
||||||
|
|
||||||
const auto t_main_start = ggml_time_us();
|
const auto t_main_start = ggml_time_us();
|
||||||
|
|
||||||
@ -222,13 +224,12 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// is it an end of generation?
|
// is it an end of generation?
|
||||||
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
|
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
|
LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
|
||||||
fflush(stdout);
|
|
||||||
|
|
||||||
n_decode += 1;
|
n_decode += 1;
|
||||||
|
|
||||||
@ -243,22 +244,22 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// evaluate the current batch with the transformer model
|
// evaluate the current batch with the transformer model
|
||||||
if (llama_decode(ctx, batch)) {
|
if (llama_decode(ctx, batch)) {
|
||||||
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
|
LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
|
|
||||||
const auto t_main_end = ggml_time_us();
|
const auto t_main_end = ggml_time_us();
|
||||||
|
|
||||||
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
|
LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
|
||||||
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
llama_perf_context_print(ctx);
|
llama_perf_context_print(ctx);
|
||||||
|
|
||||||
fprintf(stderr, "\n");
|
LOG("\n");
|
||||||
|
|
||||||
llama_sampler_free(smpl);
|
llama_sampler_free(smpl);
|
||||||
|
|
||||||
|
@ -1,7 +1,9 @@
|
|||||||
#include "arg.h"
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "log.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
#include <array>
|
#include <array>
|
||||||
#include <atomic>
|
#include <atomic>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
@ -41,7 +43,7 @@ static void write_logfile(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (params.hellaswag) {
|
if (params.hellaswag) {
|
||||||
fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
|
LOG_WRN("%s: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -49,7 +51,7 @@ static void write_logfile(
|
|||||||
|
|
||||||
const bool success = fs_create_directory_with_parents(params.logdir);
|
const bool success = fs_create_directory_with_parents(params.logdir);
|
||||||
if (!success) {
|
if (!success) {
|
||||||
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
LOG_WRN("%s: failed to create logdir %s, cannot write logfile\n",
|
||||||
__func__, params.logdir.c_str());
|
__func__, params.logdir.c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -58,7 +60,7 @@ static void write_logfile(
|
|||||||
FILE * logfile = fopen(logfile_path.c_str(), "w");
|
FILE * logfile = fopen(logfile_path.c_str(), "w");
|
||||||
|
|
||||||
if (logfile == NULL) {
|
if (logfile == NULL) {
|
||||||
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -344,16 +346,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
|||||||
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
|
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
|
||||||
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
|
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
|
||||||
|
|
||||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
LOG_INF("%s: tokenizing the input ..\n", __func__);
|
||||||
|
|
||||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||||
|
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
if (int(tokens.size()) < 2*n_ctx) {
|
if (int(tokens.size()) < 2*n_ctx) {
|
||||||
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
|
LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
|
||||||
n_ctx);
|
n_ctx);
|
||||||
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
|
LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
|
||||||
return {std::move(tokens), 0., {}, {}};
|
return {std::move(tokens), 0., {}, {}};
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -364,16 +366,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
|||||||
prob_history.resize(tokens.size());
|
prob_history.resize(tokens.size());
|
||||||
|
|
||||||
if (params.ppl_stride <= 0) {
|
if (params.ppl_stride <= 0) {
|
||||||
fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
|
LOG_ERR("%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
|
||||||
return {tokens, -1, logit_history, prob_history};
|
return {tokens, -1, logit_history, prob_history};
|
||||||
}
|
}
|
||||||
|
|
||||||
const int calc_chunk = n_ctx;
|
const int calc_chunk = n_ctx;
|
||||||
|
|
||||||
fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
|
LOG_INF("%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
|
||||||
|
|
||||||
if (int(tokens.size()) <= calc_chunk) {
|
if (int(tokens.size()) <= calc_chunk) {
|
||||||
fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
|
LOG_ERR("%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
|
||||||
tokens.size(), n_ctx, params.ppl_stride);
|
tokens.size(), n_ctx, params.ppl_stride);
|
||||||
return {tokens, -1, logit_history, prob_history};
|
return {tokens, -1, logit_history, prob_history};
|
||||||
}
|
}
|
||||||
@ -387,14 +389,14 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
|||||||
int count = 0;
|
int count = 0;
|
||||||
double nll = 0.0;
|
double nll = 0.0;
|
||||||
|
|
||||||
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
|
LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
|
||||||
|
|
||||||
for (int i = 0; i < n_chunk; ++i) {
|
for (int i = 0; i < n_chunk; ++i) {
|
||||||
const int start = i * params.ppl_stride;
|
const int start = i * params.ppl_stride;
|
||||||
const int end = start + calc_chunk;
|
const int end = start + calc_chunk;
|
||||||
|
|
||||||
const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
|
const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
|
||||||
//fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
|
//LOG_DBG("%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
|
||||||
|
|
||||||
std::vector<float> logits;
|
std::vector<float> logits;
|
||||||
|
|
||||||
@ -407,10 +409,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
|||||||
const int batch_start = start + j * n_batch;
|
const int batch_start = start + j * n_batch;
|
||||||
const int batch_size = std::min(end - batch_start, n_batch);
|
const int batch_size = std::min(end - batch_start, n_batch);
|
||||||
|
|
||||||
//fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
|
//LOG_DBG(" Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
|
||||||
// TODO: use llama_batch.logits instead of relying on logits_all == true
|
// TODO: use llama_batch.logits instead of relying on logits_all == true
|
||||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
|
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
|
||||||
//fprintf(stderr, "%s : failed to eval\n", __func__);
|
//LOG_ERR("%s : failed to eval\n", __func__);
|
||||||
return {tokens, -1, logit_history, prob_history};
|
return {tokens, -1, logit_history, prob_history};
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -434,16 +436,17 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
|||||||
|
|
||||||
if (i == 0) {
|
if (i == 0) {
|
||||||
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
|
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
|
||||||
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
|
LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
|
||||||
int total_seconds = (int)(t_total * n_chunk);
|
int total_seconds = (int)(t_total * n_chunk);
|
||||||
if (total_seconds >= 60*60) {
|
if (total_seconds >= 60*60) {
|
||||||
fprintf(stderr, "%d hours ", total_seconds / (60*60));
|
LOG("%d hours ", total_seconds / (60*60));
|
||||||
total_seconds = total_seconds % (60*60);
|
total_seconds = total_seconds % (60*60);
|
||||||
}
|
}
|
||||||
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
|
LOG("%.2f minutes\n", total_seconds / 60.0);
|
||||||
}
|
}
|
||||||
|
LOG("\n");
|
||||||
|
|
||||||
//fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
|
//LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
|
||||||
for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
|
for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
|
||||||
|
|
||||||
// Calculate probability of next token, given the previous ones.
|
// Calculate probability of next token, given the previous ones.
|
||||||
@ -460,13 +463,12 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
|||||||
}
|
}
|
||||||
// perplexity is e^(average negative log-likelihood)
|
// perplexity is e^(average negative log-likelihood)
|
||||||
if (params.ppl_output_type == 0) {
|
if (params.ppl_output_type == 0) {
|
||||||
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
|
LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
|
||||||
} else {
|
} else {
|
||||||
printf("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
|
LOG("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
|
||||||
}
|
}
|
||||||
fflush(stdout);
|
|
||||||
}
|
}
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
|
|
||||||
return {tokens, std::exp(nll / count), logit_history, prob_history};
|
return {tokens, std::exp(nll / count), logit_history, prob_history};
|
||||||
}
|
}
|
||||||
@ -488,26 +490,26 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
|||||||
if (!params.logits_file.empty()) {
|
if (!params.logits_file.empty()) {
|
||||||
logits_stream.open(params.logits_file.c_str(), std::ios::binary);
|
logits_stream.open(params.logits_file.c_str(), std::ios::binary);
|
||||||
if (!logits_stream.is_open()) {
|
if (!logits_stream.is_open()) {
|
||||||
fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
|
LOG_ERR("%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
|
LOG_INF("%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
|
||||||
logits_stream.write("_logits_", 8);
|
logits_stream.write("_logits_", 8);
|
||||||
logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));
|
logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));
|
||||||
}
|
}
|
||||||
|
|
||||||
auto tim1 = std::chrono::high_resolution_clock::now();
|
auto tim1 = std::chrono::high_resolution_clock::now();
|
||||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
LOG_INF("%s: tokenizing the input ..\n", __func__);
|
||||||
|
|
||||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||||
|
|
||||||
auto tim2 = std::chrono::high_resolution_clock::now();
|
auto tim2 = std::chrono::high_resolution_clock::now();
|
||||||
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
||||||
|
|
||||||
if (int(tokens.size()) < 2*n_ctx) {
|
if (int(tokens.size()) < 2*n_ctx) {
|
||||||
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
|
LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
|
||||||
n_ctx);
|
n_ctx);
|
||||||
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
|
LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
|
||||||
return {std::move(tokens), 0., {}, {}};
|
return {std::move(tokens), 0., {}, {}};
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -540,7 +542,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
|||||||
logits.reserve((size_t)n_ctx * n_vocab);
|
logits.reserve((size_t)n_ctx * n_vocab);
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, "%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
|
LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
|
||||||
|
|
||||||
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
|
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
|
||||||
|
|
||||||
@ -613,7 +615,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (llama_decode(ctx, batch)) {
|
if (llama_decode(ctx, batch)) {
|
||||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
LOG_INF("%s : failed to eval\n", __func__);
|
||||||
return {tokens, -1, logit_history, prob_history};
|
return {tokens, -1, logit_history, prob_history};
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -628,14 +630,15 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
|||||||
llama_synchronize(ctx);
|
llama_synchronize(ctx);
|
||||||
const auto t_end = std::chrono::high_resolution_clock::now();
|
const auto t_end = std::chrono::high_resolution_clock::now();
|
||||||
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
|
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
|
||||||
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
|
LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
|
||||||
int total_seconds = (int)(t_total*n_chunk/n_seq);
|
int total_seconds = (int)(t_total*n_chunk/n_seq);
|
||||||
if (total_seconds >= 60*60) {
|
if (total_seconds >= 60*60) {
|
||||||
fprintf(stderr, "%d hours ", total_seconds / (60*60));
|
LOG("%d hours ", total_seconds / (60*60));
|
||||||
total_seconds = total_seconds % (60*60);
|
total_seconds = total_seconds % (60*60);
|
||||||
}
|
}
|
||||||
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
|
LOG("%.2f minutes\n", total_seconds / 60.0);
|
||||||
}
|
}
|
||||||
|
LOG("\n");
|
||||||
|
|
||||||
for (int seq = 0; seq < n_seq_batch; seq++) {
|
for (int seq = 0; seq < n_seq_batch; seq++) {
|
||||||
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first);
|
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first);
|
||||||
@ -656,19 +659,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
|||||||
|
|
||||||
// perplexity is e^(average negative log-likelihood)
|
// perplexity is e^(average negative log-likelihood)
|
||||||
if (params.ppl_output_type == 0) {
|
if (params.ppl_output_type == 0) {
|
||||||
printf("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
|
LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
|
||||||
} else {
|
} else {
|
||||||
double av = nll/count;
|
double av = nll/count;
|
||||||
double av2 = nll2/count - av*av;
|
double av2 = nll2/count - av*av;
|
||||||
if (av2 > 0) av2 = sqrt(av2/(count-1));
|
if (av2 > 0) av2 = sqrt(av2/(count-1));
|
||||||
printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
|
LOG("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fflush(stdout);
|
|
||||||
|
|
||||||
logits.clear();
|
logits.clear();
|
||||||
}
|
}
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
|
|
||||||
nll2 /= count;
|
nll2 /= count;
|
||||||
nll /= count;
|
nll /= count;
|
||||||
@ -676,9 +678,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
|||||||
nll2 -= nll * nll;
|
nll2 -= nll * nll;
|
||||||
if (nll2 > 0) {
|
if (nll2 > 0) {
|
||||||
nll2 = sqrt(nll2/(count-1));
|
nll2 = sqrt(nll2/(count-1));
|
||||||
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
|
LOG_INF("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
|
||||||
} else {
|
} else {
|
||||||
printf("Unexpected negative standard deviation of log(prob)\n");
|
LOG_ERR("Unexpected negative standard deviation of log(prob)\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
@ -704,7 +706,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
|
|||||||
|
|
||||||
const int ret = llama_decode(ctx, batch_view);
|
const int ret = llama_decode(ctx, batch_view);
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
|
LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -790,15 +792,15 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (prompt_lines.size() % 6 != 0) {
|
if (prompt_lines.size() % 6 != 0) {
|
||||||
fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__);
|
LOG_ERR("%s : number of lines in prompt not a multiple of 6.\n", __func__);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t hs_task_count = prompt_lines.size()/6;
|
size_t hs_task_count = prompt_lines.size()/6;
|
||||||
fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
|
LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
|
||||||
|
|
||||||
const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
|
const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
|
||||||
fprintf(stderr, "================================= is_spm = %d\n", is_spm);
|
LOG_INF("================================= is_spm = %d\n", is_spm);
|
||||||
|
|
||||||
// The tasks should be randomized so the score stabilizes quickly.
|
// The tasks should be randomized so the score stabilizes quickly.
|
||||||
bool randomize_tasks = true;
|
bool randomize_tasks = true;
|
||||||
@ -825,7 +827,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
|||||||
std::vector<llama_token> seq_tokens[4];
|
std::vector<llama_token> seq_tokens[4];
|
||||||
};
|
};
|
||||||
|
|
||||||
fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
|
LOG_INF("%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
|
||||||
|
|
||||||
// Select and read data from prompt lines
|
// Select and read data from prompt lines
|
||||||
std::vector<hs_data_t> hs_data(hs_task_count);
|
std::vector<hs_data_t> hs_data(hs_task_count);
|
||||||
@ -871,9 +873,9 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__);
|
LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__);
|
||||||
|
|
||||||
printf("\ntask\tacc_norm\n");
|
LOG("\ntask\tacc_norm\n");
|
||||||
|
|
||||||
double acc = 0.0f;
|
double acc = 0.0f;
|
||||||
|
|
||||||
@ -941,7 +943,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (i0 == i1) {
|
if (i0 == i1) {
|
||||||
fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
|
LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -949,7 +951,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
|||||||
|
|
||||||
// decode all tasks [i0, i1)
|
// decode all tasks [i0, i1)
|
||||||
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
|
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
|
||||||
fprintf(stderr, "%s: llama_decode() failed\n", __func__);
|
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -999,7 +1001,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
|
//LOG("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
|
||||||
|
|
||||||
// If the gold ending got the maximum logprobe add one accuracy point
|
// If the gold ending got the maximum logprobe add one accuracy point
|
||||||
if (ending_logprob_max_idx == hs_cur.gold_ending_idx) {
|
if (ending_logprob_max_idx == hs_cur.gold_ending_idx) {
|
||||||
@ -1007,8 +1009,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Print the accumulated accuracy mean x 100
|
// Print the accumulated accuracy mean x 100
|
||||||
printf("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
|
LOG("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
|
||||||
fflush(stdout);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
i0 = i1 - 1;
|
i0 = i1 - 1;
|
||||||
@ -1016,7 +1017,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
|||||||
|
|
||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
|
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
struct winogrande_entry {
|
struct winogrande_entry {
|
||||||
@ -1060,7 +1061,7 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (ipos != 4) {
|
if (ipos != 4) {
|
||||||
printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
|
LOG_ERR("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3)
|
auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3)
|
||||||
@ -1074,13 +1075,13 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
|
|||||||
if (sentence[where] == '_') break;
|
if (sentence[where] == '_') break;
|
||||||
}
|
}
|
||||||
if (where == int(sentence.size())) {
|
if (where == int(sentence.size())) {
|
||||||
printf("%s: no _ in <%s>\n", __func__, sentence.c_str());
|
LOG_ERR("%s: no _ in <%s>\n", __func__, sentence.c_str());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
std::istringstream stream(answer.c_str());
|
std::istringstream stream(answer.c_str());
|
||||||
int i_answer; stream >> i_answer;
|
int i_answer; stream >> i_answer;
|
||||||
if (stream.fail() || i_answer < 1 || i_answer > 2) {
|
if (stream.fail() || i_answer < 1 || i_answer > 2) {
|
||||||
printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
|
LOG_ERR("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
result.emplace_back();
|
result.emplace_back();
|
||||||
@ -1109,14 +1110,14 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
|||||||
|
|
||||||
auto data = load_winogrande_from_csv(params.prompt);
|
auto data = load_winogrande_from_csv(params.prompt);
|
||||||
if (data.empty()) {
|
if (data.empty()) {
|
||||||
fprintf(stderr, "%s: no tasks\n", __func__);
|
LOG_ERR("%s: no tasks\n", __func__);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size());
|
LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, data.size());
|
||||||
|
|
||||||
if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) {
|
if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) {
|
||||||
fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
|
LOG_INF("%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
|
||||||
std::mt19937 rng(1);
|
std::mt19937 rng(1);
|
||||||
std::vector<int> aux(data.size());
|
std::vector<int> aux(data.size());
|
||||||
for (int i = 0; i < int(data.size()); ++i) {
|
for (int i = 0; i < int(data.size()); ++i) {
|
||||||
@ -1134,7 +1135,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
|||||||
data = std::move(selected);
|
data = std::move(selected);
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
|
LOG_INF("%s : tokenizing selected tasks\n", __func__);
|
||||||
|
|
||||||
for (auto & task : data) {
|
for (auto & task : data) {
|
||||||
task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
|
task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
|
||||||
@ -1157,7 +1158,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
|||||||
task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
|
task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
|
LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);
|
||||||
|
|
||||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
@ -1218,7 +1219,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (i0 == i1) {
|
if (i0 == i1) {
|
||||||
fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
|
LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1226,7 +1227,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
|||||||
|
|
||||||
// decode all tasks [i0, i1)
|
// decode all tasks [i0, i1)
|
||||||
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
|
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
|
||||||
fprintf(stderr, "%s: llama_decode() failed\n", __func__);
|
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1286,20 +1287,20 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
|||||||
++n_done;
|
++n_done;
|
||||||
|
|
||||||
// print the accumulated accuracy mean x 100
|
// print the accumulated accuracy mean x 100
|
||||||
printf("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
|
LOG("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
|
||||||
fflush(stdout);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
i0 = i1 - 1;
|
i0 = i1 - 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
|
|
||||||
if (n_done < 100) return;
|
if (n_done < 100) return;
|
||||||
|
|
||||||
const float p = 1.f*n_correct/n_done;
|
const float p = 1.f*n_correct/n_done;
|
||||||
const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1));
|
const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1));
|
||||||
printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
|
|
||||||
|
LOG_INF("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool deserialize_string(std::istream & in, std::string & str) {
|
static bool deserialize_string(std::istream & in, std::string & str) {
|
||||||
@ -1348,7 +1349,7 @@ struct multiple_choice_task {
|
|||||||
static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
|
static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
|
||||||
if (task.question.empty() || task.mc1.answers.empty()) {
|
if (task.question.empty() || task.mc1.answers.empty()) {
|
||||||
if (log_error) {
|
if (log_error) {
|
||||||
printf("%s: found bad task with empty question and/or answers\n", __func__);
|
LOG_ERR("%s: found bad task with empty question and/or answers\n", __func__);
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -1356,7 +1357,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
|
|||||||
for (auto& answer : task.mc1.answers) {
|
for (auto& answer : task.mc1.answers) {
|
||||||
if (answer.empty()) {
|
if (answer.empty()) {
|
||||||
if (log_error) {
|
if (log_error) {
|
||||||
printf("%s: found empty answer\n", __func__);
|
LOG_ERR("%s: found empty answer\n", __func__);
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -1410,14 +1411,14 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
|||||||
uint32_t n_task;
|
uint32_t n_task;
|
||||||
strstream.read((char *)&n_task, sizeof(n_task));
|
strstream.read((char *)&n_task, sizeof(n_task));
|
||||||
if (strstream.fail() || n_task == 0) {
|
if (strstream.fail() || n_task == 0) {
|
||||||
printf("%s: no tasks\n", __func__);
|
LOG_ERR("%s: no tasks\n", __func__);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
printf("%s: there are %u tasks in prompt\n", __func__, n_task);
|
LOG_INF("%s: there are %u tasks in prompt\n", __func__, n_task);
|
||||||
std::vector<uint32_t> task_pos(n_task);
|
std::vector<uint32_t> task_pos(n_task);
|
||||||
strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
|
strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
|
||||||
if (strstream.fail()) {
|
if (strstream.fail()) {
|
||||||
printf("%s: failed to read task positions from prompt\n", __func__);
|
LOG_ERR("%s: failed to read task positions from prompt\n", __func__);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1425,21 +1426,21 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
|||||||
if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
|
if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
|
||||||
// Use all tasks
|
// Use all tasks
|
||||||
tasks.resize(n_task);
|
tasks.resize(n_task);
|
||||||
printf("%s: reading tasks", __func__);
|
LOG_INF("%s: reading tasks", __func__);
|
||||||
int n_dot = std::max((int) n_task/100, 1);
|
int n_dot = std::max((int) n_task/100, 1);
|
||||||
int i = 0;
|
int i = 0;
|
||||||
for (auto& task : tasks) {
|
for (auto& task : tasks) {
|
||||||
++i;
|
++i;
|
||||||
if (!task.deserialize(strstream)) {
|
if (!task.deserialize(strstream)) {
|
||||||
printf("%s: failed to read task %d of %u\n", __func__, i, n_task);
|
LOG_ERR("%s: failed to read task %d of %u\n", __func__, i, n_task);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (i%n_dot == 0) printf(".");
|
if (i%n_dot == 0) LOG(".");
|
||||||
}
|
}
|
||||||
printf("done\n");
|
LOG("done\n");
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
|
LOG_INF("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
|
||||||
std::mt19937 rng(1);
|
std::mt19937 rng(1);
|
||||||
std::vector<int> aux(n_task);
|
std::vector<int> aux(n_task);
|
||||||
for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
|
for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
|
||||||
@ -1452,18 +1453,16 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
|||||||
aux.pop_back();
|
aux.pop_back();
|
||||||
strstream.seekg(task_pos[idx], std::ios::beg);
|
strstream.seekg(task_pos[idx], std::ios::beg);
|
||||||
if (!task.deserialize(strstream)) {
|
if (!task.deserialize(strstream)) {
|
||||||
printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
|
LOG_ERR("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
n_task = params.multiple_choice_tasks;
|
n_task = params.multiple_choice_tasks;
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("%s: preparing task data", __func__);
|
LOG_INF("%s: preparing task data", __func__);
|
||||||
fflush(stdout);
|
|
||||||
if (n_task > 500) {
|
if (n_task > 500) {
|
||||||
printf("...");
|
LOG("...");
|
||||||
fflush(stdout);
|
|
||||||
std::atomic<int> counter(0);
|
std::atomic<int> counter(0);
|
||||||
std::atomic<int> n_bad(0);
|
std::atomic<int> n_bad(0);
|
||||||
auto prepare = [&counter, &n_bad, &tasks, ctx] () {
|
auto prepare = [&counter, &n_bad, &tasks, ctx] () {
|
||||||
@ -1487,11 +1486,10 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
|||||||
for (auto& w : workers) w = std::thread(prepare);
|
for (auto& w : workers) w = std::thread(prepare);
|
||||||
prepare();
|
prepare();
|
||||||
for (auto& w : workers) w.join();
|
for (auto& w : workers) w.join();
|
||||||
printf("done\n");
|
LOG("done\n");
|
||||||
fflush(stdout);
|
|
||||||
int nbad = n_bad;
|
int nbad = n_bad;
|
||||||
if (nbad > 0) {
|
if (nbad > 0) {
|
||||||
printf("%s: found %d malformed tasks\n", __func__, nbad);
|
LOG_ERR("%s: found %d malformed tasks\n", __func__, nbad);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -1503,16 +1501,15 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (i_task%n_dot == 0) {
|
if (i_task%n_dot == 0) {
|
||||||
printf(".");
|
LOG(".");
|
||||||
fflush(stdout);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
printf("done\n");
|
LOG("done\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
|
LOG_INF("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
|
||||||
|
|
||||||
printf("\ntask\tacc_norm\n");
|
LOG("\ntask\tacc_norm\n");
|
||||||
|
|
||||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
@ -1591,7 +1588,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (i0 == i1) {
|
if (i0 == i1) {
|
||||||
fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
|
LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1599,7 +1596,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
|||||||
|
|
||||||
// decode all tasks [i0, i1)
|
// decode all tasks [i0, i1)
|
||||||
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
|
if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
|
||||||
fprintf(stderr, "%s: llama_decode() failed\n", __func__);
|
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1623,13 +1620,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
|||||||
// compute the logprobs for each ending of the decoded tasks
|
// compute the logprobs for each ending of the decoded tasks
|
||||||
for (size_t i = i0; i < i1; ++i) {
|
for (size_t i = i0; i < i1; ++i) {
|
||||||
auto & cur_task = tasks[i];
|
auto & cur_task = tasks[i];
|
||||||
//printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
|
//LOG("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
|
||||||
//for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
|
//for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
|
||||||
// if (cur_task.mc1.labels[j] == 1) {
|
// if (cur_task.mc1.labels[j] == 1) {
|
||||||
// printf("%d", j+1);
|
// LOG("%d", j+1);
|
||||||
// }
|
// }
|
||||||
//}
|
//}
|
||||||
//printf("\n common_prefix: %zu\n", cur_task.common_prefix);
|
//LOG("\n common_prefix: %zu\n", cur_task.common_prefix);
|
||||||
|
|
||||||
// get the logits of the last token of the common prefix
|
// get the logits of the last token of the common prefix
|
||||||
std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float));
|
std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float));
|
||||||
@ -1641,13 +1638,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
|||||||
size_t count = 1;
|
size_t count = 1;
|
||||||
float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
|
float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
|
||||||
for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
|
for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
|
||||||
//printf(" %zu %g\n", ir, eval_results[ir]);
|
//LOG(" %zu %g\n", ir, eval_results[ir]);
|
||||||
++count;
|
++count;
|
||||||
log_prob += eval_results[ir++];
|
log_prob += eval_results[ir++];
|
||||||
}
|
}
|
||||||
cur_task.log_probs[s] = log_prob / count;
|
cur_task.log_probs[s] = log_prob / count;
|
||||||
//printf(" Final: %g\n", log_prob / count);
|
//LOG(" Final: %g\n", log_prob / count);
|
||||||
//printf(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
|
//LOG(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Find the ending with maximum logprob
|
// Find the ending with maximum logprob
|
||||||
@ -1667,8 +1664,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
|||||||
++n_done;
|
++n_done;
|
||||||
|
|
||||||
// Print the accumulated accuracy mean x 100
|
// Print the accumulated accuracy mean x 100
|
||||||
printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
|
LOG("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
|
||||||
fflush(stdout);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
i0 = i1 - 1;
|
i0 = i1 - 1;
|
||||||
@ -1680,29 +1676,30 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
|||||||
|
|
||||||
float p = 1.f*n_correct/n_done;
|
float p = 1.f*n_correct/n_done;
|
||||||
float sigma = sqrt(p*(1-p)/(n_done-1));
|
float sigma = sqrt(p*(1-p)/(n_done-1));
|
||||||
printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
|
LOG("\n");
|
||||||
|
LOG_INF("Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
|
||||||
p = 1.f*n_done/n_tot_answers;
|
p = 1.f*n_done/n_tot_answers;
|
||||||
sigma = sqrt(p*(1-p)/(n_done-1));
|
sigma = sqrt(p*(1-p)/(n_done-1));
|
||||||
printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
|
LOG_INF("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
|
||||||
|
|
||||||
printf("\n");
|
LOG_INF("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
||||||
if (params.logits_file.empty()) {
|
if (params.logits_file.empty()) {
|
||||||
fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
|
LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
std::ifstream in(params.logits_file.c_str(), std::ios::binary);
|
std::ifstream in(params.logits_file.c_str(), std::ios::binary);
|
||||||
if (!in) {
|
if (!in) {
|
||||||
fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str());
|
LOG_ERR("%s: failed to open %s\n", __func__, params.logits_file.c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
char check[9]; check[8] = 0;
|
char check[9]; check[8] = 0;
|
||||||
in.read(check, 8);
|
in.read(check, 8);
|
||||||
if (in.fail() || strncmp("_logits_", check, 8) != 0) {
|
if (in.fail() || strncmp("_logits_", check, 8) != 0) {
|
||||||
fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
|
LOG_ERR("%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1710,7 +1707,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
|||||||
uint32_t n_ctx;
|
uint32_t n_ctx;
|
||||||
in.read((char *)&n_ctx, sizeof(n_ctx));
|
in.read((char *)&n_ctx, sizeof(n_ctx));
|
||||||
if (n_ctx > llama_n_ctx(ctx)) {
|
if (n_ctx > llama_n_ctx(ctx)) {
|
||||||
fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
|
LOG_ERR("%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
|
||||||
__func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
|
__func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1718,16 +1715,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
|||||||
in.read((char *)&n_vocab, sizeof(n_vocab));
|
in.read((char *)&n_vocab, sizeof(n_vocab));
|
||||||
in.read((char *)&n_chunk, sizeof(n_chunk));
|
in.read((char *)&n_chunk, sizeof(n_chunk));
|
||||||
if (in.fail()) {
|
if (in.fail()) {
|
||||||
fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
|
LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
|
if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
|
||||||
fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
|
LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<llama_token> tokens(n_ctx * n_chunk);
|
std::vector<llama_token> tokens(n_ctx * n_chunk);
|
||||||
if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
|
if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
|
||||||
fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
|
LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1776,7 +1773,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
|||||||
const auto t_start = std::chrono::high_resolution_clock::now();
|
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
|
if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
|
||||||
fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i);
|
LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1797,7 +1794,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
|||||||
|
|
||||||
// TODO: use llama_batch.logits instead of relying on logits_all == true
|
// TODO: use llama_batch.logits instead of relying on logits_all == true
|
||||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
|
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
|
||||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
LOG_ERR("%s : failed to eval\n", __func__);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1814,16 +1811,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
|||||||
|
|
||||||
if (i == 0) {
|
if (i == 0) {
|
||||||
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
|
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
|
||||||
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
|
LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
|
||||||
int total_seconds = (int)(t_total * n_chunk);
|
int total_seconds = (int)(t_total * n_chunk);
|
||||||
if (total_seconds >= 60*60) {
|
if (total_seconds >= 60*60) {
|
||||||
fprintf(stderr, "%d hours ", total_seconds / (60*60));
|
LOG("%d hours ", total_seconds / (60*60));
|
||||||
total_seconds = total_seconds % (60*60);
|
total_seconds = total_seconds % (60*60);
|
||||||
}
|
}
|
||||||
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
|
LOG("%.2f minutes\n", total_seconds / 60.0);
|
||||||
|
|
||||||
printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
|
|
||||||
}
|
}
|
||||||
|
LOG("\n");
|
||||||
|
LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
|
||||||
|
|
||||||
const int first = n_ctx/2;
|
const int first = n_ctx/2;
|
||||||
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
|
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
|
||||||
@ -1832,79 +1829,77 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
|||||||
p_diff_ptr += n_ctx - 1 - first;
|
p_diff_ptr += n_ctx - 1 - first;
|
||||||
kld_ptr += n_ctx - 1 - first;
|
kld_ptr += n_ctx - 1 - first;
|
||||||
|
|
||||||
printf("%4d", i+1);
|
LOG("%4d", i+1);
|
||||||
|
|
||||||
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
|
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
|
||||||
const double ppl_val = exp(log_ppl.first);
|
const double ppl_val = exp(log_ppl.first);
|
||||||
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
|
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
|
||||||
printf(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
|
LOG(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
|
||||||
|
|
||||||
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
|
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
|
||||||
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
|
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
|
||||||
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
|
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
|
||||||
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
|
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
|
||||||
printf(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
|
LOG(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
|
||||||
|
|
||||||
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
|
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
|
||||||
printf(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
|
LOG(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
|
||||||
|
|
||||||
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
|
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
|
||||||
const double p_diff_rms_val = sqrt(p_diff_mse.first);
|
const double p_diff_rms_val = sqrt(p_diff_mse.first);
|
||||||
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
|
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
|
||||||
printf(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
|
LOG(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
|
||||||
|
|
||||||
double p_top_val = 1.*kld.n_same_top/kld.count;
|
double p_top_val = 1.*kld.n_same_top/kld.count;
|
||||||
double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
|
double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
|
||||||
printf(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
|
LOG(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
|
||||||
|
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
|
|
||||||
fflush(stdout);
|
|
||||||
|
|
||||||
logits.clear();
|
logits.clear();
|
||||||
}
|
}
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
|
|
||||||
if (kld.count < 100) return; // we do not wish to do statistics on so few values
|
if (kld.count < 100) return; // we do not wish to do statistics on so few values
|
||||||
|
|
||||||
std::sort(kld_values.begin(), kld_values.end());
|
std::sort(kld_values.begin(), kld_values.end());
|
||||||
std::sort(p_diff_values.begin(), p_diff_values.end());
|
std::sort(p_diff_values.begin(), p_diff_values.end());
|
||||||
|
|
||||||
printf("====== Perplexity statistics ======\n");
|
LOG("====== Perplexity statistics ======\n");
|
||||||
|
|
||||||
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
|
auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
|
||||||
const double ppl_val = exp(log_ppl.first);
|
const double ppl_val = exp(log_ppl.first);
|
||||||
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
|
const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
|
||||||
printf("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
|
LOG("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
|
||||||
|
|
||||||
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
|
auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
|
||||||
const double ppl_base_val = exp(log_ppl_base.first);
|
const double ppl_base_val = exp(log_ppl_base.first);
|
||||||
const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 )
|
const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 )
|
||||||
printf("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
|
LOG("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
|
||||||
|
|
||||||
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
|
const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
|
||||||
// printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
|
// LOG("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
|
||||||
const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second);
|
const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second);
|
||||||
printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
|
LOG("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
|
||||||
|
|
||||||
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
|
const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
|
||||||
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
|
const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
|
||||||
printf("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
|
LOG("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
|
||||||
|
|
||||||
const double ppl_ratio_val = exp(log_ppl_ratio_val);
|
const double ppl_ratio_val = exp(log_ppl_ratio_val);
|
||||||
const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 )
|
const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 )
|
||||||
printf("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
|
LOG("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
|
||||||
|
|
||||||
const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov;
|
const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov;
|
||||||
const double ppl_diff_val = ppl_val - ppl_base_val;
|
const double ppl_diff_val = ppl_val - ppl_base_val;
|
||||||
const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov);
|
const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov);
|
||||||
printf("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
|
LOG("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
|
||||||
|
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
|
|
||||||
printf("====== KL divergence statistics ======\n");
|
LOG("====== KL divergence statistics ======\n");
|
||||||
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
|
auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
|
||||||
printf("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
|
LOG("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
|
||||||
auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
|
auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
|
||||||
: kld_values[kld_values.size()/2];
|
: kld_values[kld_values.size()/2];
|
||||||
|
|
||||||
@ -1916,50 +1911,49 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
|||||||
return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
|
return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
|
||||||
};
|
};
|
||||||
|
|
||||||
printf("Maximum KLD: %10.6f\n", kld_values.back());
|
LOG("Maximum KLD: %10.6f\n", kld_values.back());
|
||||||
printf("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
|
LOG("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
|
||||||
printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
|
LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
|
||||||
printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
|
LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
|
||||||
printf("Median KLD: %10.6f\n", kld_median);
|
LOG("Median KLD: %10.6f\n", kld_median);
|
||||||
printf("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
|
LOG("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
|
||||||
printf(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
|
LOG(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
|
||||||
printf(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
|
LOG(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
|
||||||
printf("Minimum KLD: %10.6f\n", kld_values.front());
|
LOG("Minimum KLD: %10.6f\n", kld_values.front());
|
||||||
|
|
||||||
printf("\n");
|
LOG("\n");
|
||||||
|
|
||||||
printf("====== Token probability statistics ======\n");
|
LOG("====== Token probability statistics ======\n");
|
||||||
|
|
||||||
auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
|
auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
|
||||||
printf("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);
|
LOG("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);
|
||||||
|
|
||||||
auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1])
|
auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1])
|
||||||
: p_diff_values[p_diff_values.size()/2];
|
: p_diff_values[p_diff_values.size()/2];
|
||||||
|
|
||||||
printf("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back());
|
LOG("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back());
|
||||||
printf("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
|
LOG("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
|
||||||
printf("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
|
LOG("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
|
||||||
printf("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
|
LOG("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
|
||||||
printf("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
|
LOG("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
|
||||||
printf("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
|
LOG("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
|
||||||
printf("Median Δp: %6.3lf%%\n", 100.0*p_diff_median);
|
LOG("Median Δp: %6.3lf%%\n", 100.0*p_diff_median);
|
||||||
printf("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
|
LOG("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
|
||||||
printf("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
|
LOG("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
|
||||||
printf(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
|
LOG(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
|
||||||
printf(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
|
LOG(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
|
||||||
printf(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
|
LOG(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
|
||||||
printf("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front());
|
LOG("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front());
|
||||||
|
|
||||||
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
|
auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
|
||||||
// printf("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
|
// LOG("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
|
||||||
|
|
||||||
const double p_diff_rms_val = sqrt(p_diff_mse.first);
|
const double p_diff_rms_val = sqrt(p_diff_mse.first);
|
||||||
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
|
const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
|
||||||
printf("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
|
LOG("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
|
||||||
|
|
||||||
const double same_top_p = 1.0*kld.n_same_top/kld.count;
|
const double same_top_p = 1.0*kld.n_same_top/kld.count;
|
||||||
printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
|
LOG("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
@ -1972,10 +1966,12 @@ int main(int argc, char ** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
gpt_init();
|
||||||
|
|
||||||
const int32_t n_ctx = params.n_ctx;
|
const int32_t n_ctx = params.n_ctx;
|
||||||
|
|
||||||
if (n_ctx <= 0) {
|
if (n_ctx <= 0) {
|
||||||
fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
|
LOG_ERR("%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2000,13 +1996,11 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (params.ppl_stride > 0) {
|
if (params.ppl_stride > 0) {
|
||||||
fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
|
LOG_INF("Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
|
||||||
params.n_ctx, params.n_ctx + params.ppl_stride/2);
|
params.n_ctx, params.n_ctx + params.ppl_stride/2);
|
||||||
params.n_ctx += params.ppl_stride/2;
|
params.n_ctx += params.ppl_stride/2;
|
||||||
}
|
}
|
||||||
|
|
||||||
print_build_info();
|
|
||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
|
|
||||||
@ -2016,21 +2010,21 @@ int main(int argc, char ** argv) {
|
|||||||
llama_model * model = llama_init.model;
|
llama_model * model = llama_init.model;
|
||||||
llama_context * ctx = llama_init.context;
|
llama_context * ctx = llama_init.context;
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
LOG_ERR("%s: unable to load model\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int n_ctx_train = llama_n_ctx_train(model);
|
const int n_ctx_train = llama_n_ctx_train(model);
|
||||||
|
|
||||||
if (params.n_ctx > n_ctx_train) {
|
if (params.n_ctx > n_ctx_train) {
|
||||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
|
||||||
__func__, n_ctx_train, params.n_ctx);
|
__func__, n_ctx_train, params.n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
fprintf(stderr, "\n");
|
LOG_INF("\n");
|
||||||
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
struct results_perplexity results;
|
struct results_perplexity results;
|
||||||
@ -2046,8 +2040,9 @@ int main(int argc, char ** argv) {
|
|||||||
results = perplexity(ctx, params, n_ctx);
|
results = perplexity(ctx, params, n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
llama_perf_context_print(ctx);
|
llama_perf_context_print(ctx);
|
||||||
|
|
||||||
write_logfile(ctx, params, model, results);
|
write_logfile(ctx, params, model, results);
|
||||||
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
|
@ -1,14 +1,16 @@
|
|||||||
#include "arg.h"
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "log.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
|
#include <iostream> // TODO: remove me
|
||||||
|
|
||||||
static void print_usage(int, char ** argv) {
|
static void print_usage(int, char ** argv) {
|
||||||
LOG_TEE("\nexample usage:\n");
|
LOG("\nexample usage:\n");
|
||||||
LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
|
LOG("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]);
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
struct chunk {
|
struct chunk {
|
||||||
@ -17,7 +19,7 @@ struct chunk {
|
|||||||
// original file position
|
// original file position
|
||||||
size_t filepos;
|
size_t filepos;
|
||||||
// original text data
|
// original text data
|
||||||
std::string textdata = "";
|
std::string textdata;
|
||||||
// tokenized text data
|
// tokenized text data
|
||||||
std::vector<llama_token> tokens;
|
std::vector<llama_token> tokens;
|
||||||
// embedding
|
// embedding
|
||||||
@ -31,14 +33,14 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
|
|||||||
std::ifstream f(filename.c_str());
|
std::ifstream f(filename.c_str());
|
||||||
|
|
||||||
if (!f.is_open()) {
|
if (!f.is_open()) {
|
||||||
fprintf(stderr, "Error: could not open file %s\n", filename.c_str());
|
LOG_ERR("could not open file %s\n", filename.c_str());
|
||||||
return chunks;
|
return chunks;
|
||||||
}
|
}
|
||||||
|
|
||||||
chunk current_chunk;
|
chunk current_chunk;
|
||||||
char buffer[1024];
|
char buffer[1024];
|
||||||
int64_t filepos = 0;
|
int64_t filepos = 0;
|
||||||
std::string current = "";
|
std::string current;
|
||||||
while (f.read(buffer, 1024)) {
|
while (f.read(buffer, 1024)) {
|
||||||
current += std::string(buffer, f.gcount());
|
current += std::string(buffer, f.gcount());
|
||||||
size_t pos;
|
size_t pos;
|
||||||
@ -84,9 +86,9 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
|||||||
llama_kv_cache_clear(ctx);
|
llama_kv_cache_clear(ctx);
|
||||||
|
|
||||||
// run model
|
// run model
|
||||||
fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
||||||
if (llama_decode(ctx, batch) < 0) {
|
if (llama_decode(ctx, batch) < 0) {
|
||||||
fprintf(stderr, "%s : failed to decode\n", __func__);
|
LOG_ERR("%s : failed to decode\n", __func__);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < batch.n_tokens; i++) {
|
for (int i = 0; i < batch.n_tokens; i++) {
|
||||||
@ -99,7 +101,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
|||||||
if (embd == NULL) {
|
if (embd == NULL) {
|
||||||
embd = llama_get_embeddings_ith(ctx, i);
|
embd = llama_get_embeddings_ith(ctx, i);
|
||||||
if (embd == NULL) {
|
if (embd == NULL) {
|
||||||
fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
|
LOG_ERR("%s: failed to get embeddings for token %d\n", __func__, i);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -116,24 +118,24 @@ int main(int argc, char ** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
gpt_init();
|
||||||
|
|
||||||
// For BERT models, batch size must be equal to ubatch size
|
// For BERT models, batch size must be equal to ubatch size
|
||||||
params.n_ubatch = params.n_batch;
|
params.n_ubatch = params.n_batch;
|
||||||
params.embedding = true;
|
params.embedding = true;
|
||||||
|
|
||||||
if (params.chunk_size <= 0) {
|
if (params.chunk_size <= 0) {
|
||||||
fprintf(stderr, "chunk_size must be positive\n");
|
LOG_ERR("chunk_size must be positive\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
if (params.context_files.empty()) {
|
if (params.context_files.empty()) {
|
||||||
fprintf(stderr, "context_files must be specified\n");
|
LOG_ERR("context_files must be specified\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
print_build_info();
|
LOG_INF("processing files:\n");
|
||||||
|
|
||||||
printf("processing files:\n");
|
|
||||||
for (auto & context_file : params.context_files) {
|
for (auto & context_file : params.context_files) {
|
||||||
printf("%s\n", context_file.c_str());
|
LOG_INF("%s\n", context_file.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<chunk> chunks;
|
std::vector<chunk> chunks;
|
||||||
@ -141,7 +143,7 @@ int main(int argc, char ** argv) {
|
|||||||
std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
|
std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
|
||||||
chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
|
chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
|
||||||
}
|
}
|
||||||
printf("Number of chunks: %ld\n", chunks.size());
|
LOG_INF("Number of chunks: %ld\n", chunks.size());
|
||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
@ -153,7 +155,7 @@ int main(int argc, char ** argv) {
|
|||||||
llama_context * ctx = llama_init.context;
|
llama_context * ctx = llama_init.context;
|
||||||
|
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
LOG_ERR("%s: unable to load model\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -162,19 +164,19 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
|
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
|
||||||
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
||||||
fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
|
LOG_ERR("%s: pooling type NONE not supported\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (n_ctx > n_ctx_train) {
|
if (n_ctx > n_ctx_train) {
|
||||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||||
__func__, n_ctx_train, n_ctx);
|
__func__, n_ctx_train, n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
fprintf(stderr, "\n");
|
LOG_INF("\n");
|
||||||
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
|
LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
// max batch size
|
// max batch size
|
||||||
@ -185,7 +187,7 @@ int main(int argc, char ** argv) {
|
|||||||
for (auto & chunk : chunks) {
|
for (auto & chunk : chunks) {
|
||||||
auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false);
|
auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false);
|
||||||
if (inp.size() > n_batch) {
|
if (inp.size() > n_batch) {
|
||||||
fprintf(stderr, "%s: error: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
|
LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
|
||||||
__func__, (long long int) inp.size(), (long long int) n_batch);
|
__func__, (long long int) inp.size(), (long long int) n_batch);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@ -199,12 +201,12 @@ int main(int argc, char ** argv) {
|
|||||||
// tokenization stats
|
// tokenization stats
|
||||||
if (params.verbose_prompt) {
|
if (params.verbose_prompt) {
|
||||||
for (int i = 0; i < (int) chunks.size(); i++) {
|
for (int i = 0; i < (int) chunks.size(); i++) {
|
||||||
fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
|
LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
|
||||||
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
|
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
|
||||||
for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
|
for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
|
||||||
fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
|
LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
|
||||||
}
|
}
|
||||||
fprintf(stderr, "\n\n");
|
LOG_INF("\n\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -256,7 +258,7 @@ int main(int argc, char ** argv) {
|
|||||||
// start loop, receive query and return top k similar chunks based on cosine similarity
|
// start loop, receive query and return top k similar chunks based on cosine similarity
|
||||||
std::string query;
|
std::string query;
|
||||||
while (true) {
|
while (true) {
|
||||||
printf("Enter query: ");
|
LOG("Enter query: ");
|
||||||
std::getline(std::cin, query);
|
std::getline(std::cin, query);
|
||||||
std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
|
std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
|
||||||
|
|
||||||
@ -280,18 +282,18 @@ int main(int argc, char ** argv) {
|
|||||||
return a.second > b.second;
|
return a.second > b.second;
|
||||||
});
|
});
|
||||||
|
|
||||||
printf("Top %d similar chunks:\n", params.sparams.top_k);
|
LOG("Top %d similar chunks:\n", params.sparams.top_k);
|
||||||
for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
|
for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
|
||||||
printf("filename: %s\n", chunks[similarities[i].first].filename.c_str());
|
LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str());
|
||||||
printf("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
|
LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
|
||||||
printf("similarity: %f\n", similarities[i].second);
|
LOG("similarity: %f\n", similarities[i].second);
|
||||||
printf("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
|
LOG("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
|
||||||
printf("--------------------\n");
|
LOG("--------------------\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
llama_perf_context_print(ctx);
|
llama_perf_context_print(ctx);
|
||||||
|
|
||||||
// clean up
|
// clean up
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
set(TARGET llama-server)
|
set(TARGET llama-server)
|
||||||
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
|
|
||||||
option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
|
option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
|
||||||
|
|
||||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
|
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
|
||||||
|
|
||||||
@ -46,9 +46,6 @@ endforeach()
|
|||||||
|
|
||||||
add_executable(${TARGET} ${TARGET_SRCS})
|
add_executable(${TARGET} ${TARGET_SRCS})
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_compile_definitions(${TARGET} PRIVATE
|
|
||||||
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
|
||||||
)
|
|
||||||
|
|
||||||
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
||||||
|
@ -121,7 +121,6 @@ The project is under active development, and we are [looking for feedback and co
|
|||||||
| `-to, --timeout N` | server read/write timeout in seconds (default: 600) |
|
| `-to, --timeout N` | server read/write timeout in seconds (default: 600) |
|
||||||
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
|
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
|
||||||
| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
|
| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
|
||||||
| `--log-format {text, json}` | log output format: json or text (default: json) |
|
|
||||||
| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
|
| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
|
||||||
| `--no-slots` | disables slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
|
| `--no-slots` | disables slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
|
||||||
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
|
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
|
||||||
|
@ -40,7 +40,6 @@ server --host localhost --port 8080 \
|
|||||||
--parallel 8 \
|
--parallel 8 \
|
||||||
--batch-size 512 \
|
--batch-size 512 \
|
||||||
--ctx-size 4096 \
|
--ctx-size 4096 \
|
||||||
--log-format text \
|
|
||||||
-ngl 33
|
-ngl 33
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -272,7 +272,6 @@ def start_server_background(args):
|
|||||||
server_args.append('--cont-batching')
|
server_args.append('--cont-batching')
|
||||||
server_args.append('--metrics')
|
server_args.append('--metrics')
|
||||||
server_args.append('--flash-attn')
|
server_args.append('--flash-attn')
|
||||||
server_args.extend(['--log-format', "text"])
|
|
||||||
args = [str(arg) for arg in [server_path, *server_args]]
|
args = [str(arg) for arg in [server_path, *server_args]]
|
||||||
print(f"bench: starting server with: {' '.join(args)}")
|
print(f"bench: starting server with: {' '.join(args)}")
|
||||||
pkwargs = {
|
pkwargs = {
|
||||||
|
File diff suppressed because it is too large
Load Diff
1
examples/server/tests/.gitignore
vendored
Normal file
1
examples/server/tests/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
.venv
|
@ -40,7 +40,6 @@ It's possible to override some scenario steps values with environment variables:
|
|||||||
| `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
|
| `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
|
||||||
| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/llama-server` |
|
| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/llama-server` |
|
||||||
| `DEBUG` | "ON" to enable steps and server verbose mode `--verbose` |
|
| `DEBUG` | "ON" to enable steps and server verbose mode `--verbose` |
|
||||||
| `SERVER_LOG_FORMAT_JSON` | if set switch server logs to json format |
|
|
||||||
| `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` |
|
| `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` |
|
||||||
|
|
||||||
### Run @bug, @wip or @wrong_usage annotated scenario
|
### Run @bug, @wip or @wrong_usage annotated scenario
|
||||||
|
@ -1372,8 +1372,6 @@ def start_server_background(context):
|
|||||||
server_args.append('--verbose')
|
server_args.append('--verbose')
|
||||||
if context.lora_file:
|
if context.lora_file:
|
||||||
server_args.extend(['--lora', context.lora_file])
|
server_args.extend(['--lora', context.lora_file])
|
||||||
if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
|
|
||||||
server_args.extend(['--log-format', "text"])
|
|
||||||
|
|
||||||
args = [str(arg) for arg in [context.server_path, *server_args]]
|
args = [str(arg) for arg in [context.server_path, *server_args]]
|
||||||
print(f"bench: starting server with: {' '.join(args)}")
|
print(f"bench: starting server with: {' '.join(args)}")
|
||||||
|
@ -1,7 +1,8 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "llama.h"
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "log.h"
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
// crash the server in debug mode, otherwise send an http 500 error
|
// crash the server in debug mode, otherwise send an http 500 error
|
||||||
@ -15,10 +16,10 @@
|
|||||||
#define JSON_ASSERT GGML_ASSERT
|
#define JSON_ASSERT GGML_ASSERT
|
||||||
#include "json.hpp"
|
#include "json.hpp"
|
||||||
|
|
||||||
|
#include <random>
|
||||||
|
#include <sstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <sstream>
|
|
||||||
#include <random>
|
|
||||||
|
|
||||||
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
|
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
|
||||||
|
|
||||||
@ -35,32 +36,6 @@ enum error_type {
|
|||||||
ERROR_TYPE_NOT_SUPPORTED, // custom error
|
ERROR_TYPE_NOT_SUPPORTED, // custom error
|
||||||
};
|
};
|
||||||
|
|
||||||
extern bool server_verbose;
|
|
||||||
extern bool server_log_json;
|
|
||||||
|
|
||||||
#ifndef SERVER_VERBOSE
|
|
||||||
#define SERVER_VERBOSE 1
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if SERVER_VERBOSE != 1
|
|
||||||
#define LOG_VERBOSE(MSG, ...)
|
|
||||||
#else
|
|
||||||
#define LOG_VERBOSE(MSG, ...) \
|
|
||||||
do \
|
|
||||||
{ \
|
|
||||||
if (server_verbose) \
|
|
||||||
{ \
|
|
||||||
server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \
|
|
||||||
} \
|
|
||||||
} while (0)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__)
|
|
||||||
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
|
|
||||||
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
|
|
||||||
|
|
||||||
static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra);
|
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
static T json_value(const json & body, const std::string & key, const T & default_value) {
|
static T json_value(const json & body, const std::string & key, const T & default_value) {
|
||||||
// Fallback null to default value
|
// Fallback null to default value
|
||||||
@ -68,9 +43,7 @@ static T json_value(const json & body, const std::string & key, const T & defaul
|
|||||||
try {
|
try {
|
||||||
return body.at(key);
|
return body.at(key);
|
||||||
} catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
|
} catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
|
||||||
std::stringstream ss;
|
LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name());
|
||||||
ss << "Wrong type supplied for parameter '" << key << "'. Expected '" << json(default_value).type_name() << "', using default value.";
|
|
||||||
LOG_WARNING(ss.str().c_str(), body);
|
|
||||||
return default_value;
|
return default_value;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -78,48 +51,6 @@ static T json_value(const json & body, const std::string & key, const T & defaul
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra) {
|
|
||||||
std::stringstream ss_tid;
|
|
||||||
ss_tid << std::this_thread::get_id();
|
|
||||||
json log = json{
|
|
||||||
{"tid", ss_tid.str()},
|
|
||||||
{"timestamp", time(nullptr)},
|
|
||||||
};
|
|
||||||
|
|
||||||
if (server_log_json) {
|
|
||||||
log.merge_patch({
|
|
||||||
{"level", level},
|
|
||||||
{"function", function},
|
|
||||||
{"line", line},
|
|
||||||
{"msg", message},
|
|
||||||
});
|
|
||||||
|
|
||||||
if (!extra.empty()) {
|
|
||||||
log.merge_patch(extra);
|
|
||||||
}
|
|
||||||
|
|
||||||
printf("%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str());
|
|
||||||
} else {
|
|
||||||
char buf[1024];
|
|
||||||
snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
|
|
||||||
|
|
||||||
if (!extra.empty()) {
|
|
||||||
log.merge_patch(extra);
|
|
||||||
}
|
|
||||||
std::stringstream ss;
|
|
||||||
ss << buf << " |";
|
|
||||||
for (const auto & el : log.items())
|
|
||||||
{
|
|
||||||
const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
|
|
||||||
ss << " " << el.key() << "=" << value;
|
|
||||||
}
|
|
||||||
|
|
||||||
const std::string str = ss.str();
|
|
||||||
printf("%.*s\n", (int)str.size(), str.data());
|
|
||||||
}
|
|
||||||
fflush(stdout);
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// chat template utils
|
// chat template utils
|
||||||
//
|
//
|
||||||
@ -153,8 +84,9 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
|
|||||||
chat.push_back({role, content});
|
chat.push_back({role, content});
|
||||||
}
|
}
|
||||||
|
|
||||||
auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
|
const auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
|
||||||
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
|
LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
|
||||||
|
|
||||||
return formatted_chat;
|
return formatted_chat;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -243,10 +175,7 @@ static std::string random_string() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static std::string gen_chatcmplid() {
|
static std::string gen_chatcmplid() {
|
||||||
std::stringstream chatcmplid;
|
return "chatcmpl-" + random_string();
|
||||||
chatcmplid << "chatcmpl-" << random_string();
|
|
||||||
|
|
||||||
return chatcmplid.str();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
@ -287,7 +216,7 @@ static size_t find_partial_stop_string(const std::string &stop, const std::strin
|
|||||||
return std::string::npos;
|
return std::string::npos;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool json_is_array_of_numbers(json data) {
|
static bool json_is_array_of_numbers(const json & data) {
|
||||||
if (data.is_array()) {
|
if (data.is_array()) {
|
||||||
for (const auto & e : data) {
|
for (const auto & e : data) {
|
||||||
if (!e.is_number()) {
|
if (!e.is_number()) {
|
||||||
@ -363,15 +292,13 @@ static json probs_vector_to_json(const llama_context * ctx, const std::vector<co
|
|||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool server_sent_event(httplib::DataSink & sink, const char * event, json & data) {
|
static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) {
|
||||||
const std::string str =
|
const std::string str =
|
||||||
std::string(event) + ": " +
|
std::string(event) + ": " +
|
||||||
data.dump(-1, ' ', false, json::error_handler_t::replace) +
|
data.dump(-1, ' ', false, json::error_handler_t::replace) +
|
||||||
"\n\n";
|
"\n\n"; // note: these newlines are important (not sure why though, if you know, add a comment to explain)
|
||||||
|
|
||||||
LOG_VERBOSE("data stream", {
|
LOG_DBG("data stream, to_send: %s", str.c_str());
|
||||||
{ "to_send", str }
|
|
||||||
});
|
|
||||||
|
|
||||||
return sink.write(str.c_str(), str.size());
|
return sink.write(str.c_str(), str.size());
|
||||||
}
|
}
|
||||||
@ -425,7 +352,7 @@ static json oaicompat_completion_params_parse(
|
|||||||
|
|
||||||
// Params supported by OAI but unsupported by llama.cpp
|
// Params supported by OAI but unsupported by llama.cpp
|
||||||
static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
|
static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
|
||||||
for (auto & param : unsupported_params) {
|
for (const auto & param : unsupported_params) {
|
||||||
if (body.contains(param)) {
|
if (body.contains(param)) {
|
||||||
throw std::runtime_error("Unsupported param: " + param);
|
throw std::runtime_error("Unsupported param: " + param);
|
||||||
}
|
}
|
||||||
@ -444,7 +371,7 @@ static json oaicompat_completion_params_parse(
|
|||||||
return llama_params;
|
return llama_params;
|
||||||
}
|
}
|
||||||
|
|
||||||
static json format_final_response_oaicompat(const json & request, json result, const std::string & completion_id, bool streaming = false) {
|
static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) {
|
||||||
bool stopped_word = result.count("stopped_word") != 0;
|
bool stopped_word = result.count("stopped_word") != 0;
|
||||||
bool stopped_eos = json_value(result, "stopped_eos", false);
|
bool stopped_eos = json_value(result, "stopped_eos", false);
|
||||||
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
|
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
|
||||||
@ -481,7 +408,8 @@ static json format_final_response_oaicompat(const json & request, json result, c
|
|||||||
{"id", completion_id}
|
{"id", completion_id}
|
||||||
};
|
};
|
||||||
|
|
||||||
if (server_verbose) {
|
// extra fields for debugging purposes
|
||||||
|
if (verbose) {
|
||||||
res["__verbose"] = result;
|
res["__verbose"] = result;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -493,7 +421,7 @@ static json format_final_response_oaicompat(const json & request, json result, c
|
|||||||
}
|
}
|
||||||
|
|
||||||
// return value is vector as there is one case where we might need to generate two responses
|
// return value is vector as there is one case where we might need to generate two responses
|
||||||
static std::vector<json> format_partial_response_oaicompat(json result, const std::string & completion_id) {
|
static std::vector<json> format_partial_response_oaicompat(const json & result, const std::string & completion_id) {
|
||||||
if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
|
if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
|
||||||
return std::vector<json>({result});
|
return std::vector<json>({result});
|
||||||
}
|
}
|
||||||
@ -595,7 +523,7 @@ static std::vector<json> format_partial_response_oaicompat(json result, const st
|
|||||||
static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
|
static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
|
||||||
json data = json::array();
|
json data = json::array();
|
||||||
int i = 0;
|
int i = 0;
|
||||||
for (auto & elem : embeddings) {
|
for (const auto & elem : embeddings) {
|
||||||
data.push_back(json{
|
data.push_back(json{
|
||||||
{"embedding", json_value(elem, "embedding", json::array())},
|
{"embedding", json_value(elem, "embedding", json::array())},
|
||||||
{"index", i++},
|
{"index", i++},
|
||||||
|
@ -1,16 +1,14 @@
|
|||||||
#include "arg.h"
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "log.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cmath>
|
|
||||||
#include <cstdio>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
static void print_usage(int, char ** argv) {
|
static void print_usage(int, char ** argv) {
|
||||||
LOG_TEE("\nexample usage:\n");
|
LOG("\nexample usage:\n");
|
||||||
LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
|
LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
@ -23,6 +21,8 @@ int main(int argc, char ** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
gpt_init();
|
||||||
|
|
||||||
// total length of the sequence including the prompt
|
// total length of the sequence including the prompt
|
||||||
const int n_predict = params.n_predict;
|
const int n_predict = params.n_predict;
|
||||||
|
|
||||||
@ -69,25 +69,24 @@ int main(int argc, char ** argv) {
|
|||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
|
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
|
||||||
|
|
||||||
LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
|
LOG("\n");
|
||||||
|
LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
|
||||||
|
|
||||||
// make sure the KV cache is big enough to hold all the prompt and generated tokens
|
// make sure the KV cache is big enough to hold all the prompt and generated tokens
|
||||||
if (n_kv_req > n_ctx) {
|
if (n_kv_req > n_ctx) {
|
||||||
LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
|
LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
|
||||||
LOG_TEE("%s: either reduce n_predict or increase n_ctx\n", __func__);
|
LOG_ERR("%s: either reduce n_predict or increase n_ctx\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// print the prompt token-by-token
|
// print the prompt token-by-token
|
||||||
|
|
||||||
fprintf(stderr, "\n");
|
LOG("\n");
|
||||||
|
|
||||||
for (auto id : tokens_list) {
|
for (auto id : tokens_list) {
|
||||||
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
|
LOG("%s", llama_token_to_piece(ctx, id).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
fflush(stderr);
|
|
||||||
|
|
||||||
// create a llama_batch with size 512
|
// create a llama_batch with size 512
|
||||||
// we use this object to submit token data for decoding
|
// we use this object to submit token data for decoding
|
||||||
|
|
||||||
@ -102,7 +101,7 @@ int main(int argc, char ** argv) {
|
|||||||
batch.logits[batch.n_tokens - 1] = true;
|
batch.logits[batch.n_tokens - 1] = true;
|
||||||
|
|
||||||
if (llama_decode(ctx, batch) != 0) {
|
if (llama_decode(ctx, batch) != 0) {
|
||||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
LOG("%s: llama_decode() failed\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -116,16 +115,16 @@ int main(int argc, char ** argv) {
|
|||||||
while (n_cur <= n_predict) {
|
while (n_cur <= n_predict) {
|
||||||
// sample the next token
|
// sample the next token
|
||||||
{
|
{
|
||||||
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
|
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1);
|
||||||
|
|
||||||
// is it an end of generation?
|
// is it an end of generation?
|
||||||
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
|
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
|
LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
|
|
||||||
// prepare the next batch
|
// prepare the next batch
|
||||||
@ -141,23 +140,23 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
// evaluate the current batch with the transformer model
|
// evaluate the current batch with the transformer model
|
||||||
if (llama_decode(ctx, batch)) {
|
if (llama_decode(ctx, batch)) {
|
||||||
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
|
LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
|
|
||||||
const auto t_main_end = ggml_time_us();
|
const auto t_main_end = ggml_time_us();
|
||||||
|
|
||||||
LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
|
LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
|
||||||
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG("\n");
|
||||||
llama_perf_sampler_print(smpl);
|
llama_perf_sampler_print(smpl);
|
||||||
llama_perf_context_print(ctx);
|
llama_perf_context_print(ctx);
|
||||||
|
|
||||||
fprintf(stderr, "\n");
|
LOG("\n");
|
||||||
|
|
||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
llama_sampler_free(smpl);
|
llama_sampler_free(smpl);
|
||||||
|
@ -1,13 +1,16 @@
|
|||||||
#include "arg.h"
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "sampling.h"
|
#include "sampling.h"
|
||||||
|
#include "log.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
#include <cstring>
|
||||||
|
#include <random>
|
||||||
|
#include <set>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <set>
|
|
||||||
#include <random>
|
|
||||||
|
|
||||||
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
|
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
|
||||||
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
||||||
@ -33,8 +36,10 @@ int main(int argc, char ** argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
gpt_init();
|
||||||
|
|
||||||
if (params.model_draft.empty()) {
|
if (params.model_draft.empty()) {
|
||||||
fprintf(stderr, "%s: error: --model-draft is required\n", __func__);
|
LOG_ERR("%s: --model-draft is required\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -47,12 +52,6 @@ int main(int argc, char ** argv) {
|
|||||||
std::default_random_engine rng(params.sparams.seed);
|
std::default_random_engine rng(params.sparams.seed);
|
||||||
std::uniform_real_distribution<> u_dist;
|
std::uniform_real_distribution<> u_dist;
|
||||||
|
|
||||||
#ifndef LOG_DISABLE_LOGS
|
|
||||||
log_set_target(log_filename_generator("speculative", "log"));
|
|
||||||
LOG_TEE("Log start\n");
|
|
||||||
log_dump_cmdline(argc, argv);
|
|
||||||
#endif // LOG_DISABLE_LOGS
|
|
||||||
|
|
||||||
// init llama.cpp
|
// init llama.cpp
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
llama_numa_init(params.numa);
|
llama_numa_init(params.numa);
|
||||||
@ -81,14 +80,14 @@ int main(int argc, char ** argv) {
|
|||||||
ctx_dft = llama_init_dft.context;
|
ctx_dft = llama_init_dft.context;
|
||||||
|
|
||||||
const bool vocab_type_tgt = llama_vocab_type(model_tgt);
|
const bool vocab_type_tgt = llama_vocab_type(model_tgt);
|
||||||
LOG("vocab_type tgt: %d\n", vocab_type_tgt);
|
LOG_DBG("vocab_type tgt: %d\n", vocab_type_tgt);
|
||||||
|
|
||||||
const bool vocab_type_dft = llama_vocab_type(model_dft);
|
const bool vocab_type_dft = llama_vocab_type(model_dft);
|
||||||
LOG("vocab_type dft: %d\n", vocab_type_dft);
|
LOG_DBG("vocab_type dft: %d\n", vocab_type_dft);
|
||||||
|
|
||||||
if (vocab_type_tgt != vocab_type_dft) {
|
if (vocab_type_tgt != vocab_type_dft) {
|
||||||
fprintf(stderr, "%s: error: draft model vocab type must match target model to use speculation but ", __func__);
|
LOG_ERR("%s: draft model vocab type must match target model to use speculation but ", __func__);
|
||||||
fprintf(stderr, "vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
|
LOG_ERR("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -98,7 +97,7 @@ int main(int argc, char ** argv) {
|
|||||||
llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
|
llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
|
||||||
llama_token_eos(model_tgt) != llama_token_eos(model_dft)
|
llama_token_eos(model_tgt) != llama_token_eos(model_dft)
|
||||||
) {
|
) {
|
||||||
fprintf(stderr, "%s: error: draft model special tokens must match target model to use speculation\n", __func__);
|
LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -110,8 +109,8 @@ int main(int argc, char ** argv) {
|
|||||||
: n_vocab_dft - n_vocab_tgt;
|
: n_vocab_dft - n_vocab_tgt;
|
||||||
|
|
||||||
if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
|
if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
|
||||||
fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__);
|
LOG_ERR("%s: draft model vocab must closely match target model to use speculation but ", __func__);
|
||||||
fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
|
LOG_ERR("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
|
||||||
n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
|
n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@ -120,8 +119,8 @@ int main(int argc, char ** argv) {
|
|||||||
const char * token_text_tgt = llama_token_get_text(model_tgt, i);
|
const char * token_text_tgt = llama_token_get_text(model_tgt, i);
|
||||||
const char * token_text_dft = llama_token_get_text(model_dft, i);
|
const char * token_text_dft = llama_token_get_text(model_dft, i);
|
||||||
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
|
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
|
||||||
fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__);
|
LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
|
||||||
fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i,
|
LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
|
||||||
llama_token_to_piece(ctx_tgt, i).c_str(),
|
llama_token_to_piece(ctx_tgt, i).c_str(),
|
||||||
llama_token_to_piece(ctx_dft, i).c_str());
|
llama_token_to_piece(ctx_dft, i).c_str());
|
||||||
return 1;
|
return 1;
|
||||||
@ -138,18 +137,16 @@ int main(int argc, char ** argv) {
|
|||||||
const int max_tokens_list_size = max_context_size - 4;
|
const int max_tokens_list_size = max_context_size - 4;
|
||||||
|
|
||||||
if ((int) inp.size() > max_tokens_list_size) {
|
if ((int) inp.size() > max_tokens_list_size) {
|
||||||
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
|
LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, "\n\n");
|
LOG("\n\n");
|
||||||
|
|
||||||
for (auto id : inp) {
|
for (auto id : inp) {
|
||||||
fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str());
|
LOG("%s", llama_token_to_piece(ctx_tgt, id).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
fflush(stderr);
|
|
||||||
|
|
||||||
const int n_input = inp.size();
|
const int n_input = inp.size();
|
||||||
|
|
||||||
const auto t_enc_start = ggml_time_us();
|
const auto t_enc_start = ggml_time_us();
|
||||||
@ -211,7 +208,7 @@ int main(int argc, char ** argv) {
|
|||||||
active_seqs.insert(s);
|
active_seqs.insert(s);
|
||||||
const auto & tokens = drafts[s].tokens;
|
const auto & tokens = drafts[s].tokens;
|
||||||
|
|
||||||
LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str());
|
LOG_DBG("draft %d: %s\n", s, string_from(ctx_dft, tokens).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
int i_dft = 0;
|
int i_dft = 0;
|
||||||
@ -254,7 +251,7 @@ int main(int argc, char ** argv) {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
|
LOG_DBG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
|
||||||
float r = u_dist(rng);
|
float r = u_dist(rng);
|
||||||
llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true };
|
llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true };
|
||||||
|
|
||||||
@ -272,7 +269,7 @@ int main(int argc, char ** argv) {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
LOG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
|
LOG_DBG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
|
||||||
if (r <= p_tgt / p_dft) {
|
if (r <= p_tgt / p_dft) {
|
||||||
s_keep = s;
|
s_keep = s;
|
||||||
accept = true;
|
accept = true;
|
||||||
@ -280,10 +277,10 @@ int main(int argc, char ** argv) {
|
|||||||
token_str = llama_token_to_piece(ctx_tgt, token_id);
|
token_str = llama_token_to_piece(ctx_tgt, token_id);
|
||||||
gpt_sampler_accept(smpl, token_id, true);
|
gpt_sampler_accept(smpl, token_id, true);
|
||||||
|
|
||||||
LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
|
LOG_DBG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
|
||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
|
LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
|
||||||
drafts[s].active = false;
|
drafts[s].active = false;
|
||||||
|
|
||||||
// calculate residual probability
|
// calculate residual probability
|
||||||
@ -338,7 +335,7 @@ int main(int argc, char ** argv) {
|
|||||||
if (!accept) {
|
if (!accept) {
|
||||||
// all drafted tokens were rejected
|
// all drafted tokens were rejected
|
||||||
// sample from the target model
|
// sample from the target model
|
||||||
LOG("all drafted tokens were rejected, sampling from residual distribution\n");
|
LOG_DBG("all drafted tokens were rejected, sampling from residual distribution\n");
|
||||||
std::vector<float> probs(dist_tgt.size);
|
std::vector<float> probs(dist_tgt.size);
|
||||||
for (size_t i = 0; i < dist_tgt.size; ++i) {
|
for (size_t i = 0; i < dist_tgt.size; ++i) {
|
||||||
probs[i] = dist_tgt.data[i].p;
|
probs[i] = dist_tgt.data[i].p;
|
||||||
@ -356,13 +353,11 @@ int main(int argc, char ** argv) {
|
|||||||
// greedy verification
|
// greedy verification
|
||||||
|
|
||||||
// sample from the target model
|
// sample from the target model
|
||||||
LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
|
LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||||
token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);
|
token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||||
|
|
||||||
gpt_sampler_accept(smpl, token_id, true);
|
gpt_sampler_accept(smpl, token_id, true);
|
||||||
|
|
||||||
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, smpl->prev).c_str());
|
|
||||||
|
|
||||||
token_str = llama_token_to_piece(ctx_tgt, token_id);
|
token_str = llama_token_to_piece(ctx_tgt, token_id);
|
||||||
|
|
||||||
for (int s = 0; s < n_seq_dft; ++s) {
|
for (int s = 0; s < n_seq_dft; ++s) {
|
||||||
@ -371,7 +366,7 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) {
|
if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) {
|
||||||
LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());
|
LOG_DBG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());
|
||||||
|
|
||||||
s_keep = s;
|
s_keep = s;
|
||||||
accept = true;
|
accept = true;
|
||||||
@ -393,26 +388,24 @@ int main(int argc, char ** argv) {
|
|||||||
++i_dft;
|
++i_dft;
|
||||||
if (params.use_color) {
|
if (params.use_color) {
|
||||||
// Color token according to its origin sequence
|
// Color token according to its origin sequence
|
||||||
printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
|
LOG("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
|
||||||
} else {
|
} else {
|
||||||
printf("%s", token_str.c_str());
|
LOG("%s", token_str.c_str());
|
||||||
}
|
}
|
||||||
fflush(stdout);
|
|
||||||
continue;
|
continue;
|
||||||
} else {
|
} else {
|
||||||
printf("%s", token_str.c_str());
|
LOG("%s", token_str.c_str());
|
||||||
fflush(stdout);
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());
|
LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());
|
||||||
|
|
||||||
// TODO: simplify
|
// TODO: simplify
|
||||||
{
|
{
|
||||||
LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
|
LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
|
||||||
|
|
||||||
llama_kv_cache_seq_keep(ctx_dft, s_keep);
|
llama_kv_cache_seq_keep(ctx_dft, s_keep);
|
||||||
llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1);
|
llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1);
|
||||||
@ -439,7 +432,7 @@ int main(int argc, char ** argv) {
|
|||||||
llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);
|
llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);
|
||||||
|
|
||||||
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
|
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
|
||||||
// LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
|
// LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
|
||||||
llama_decode(ctx_dft, batch_dft);
|
llama_decode(ctx_dft, batch_dft);
|
||||||
|
|
||||||
++n_past_dft;
|
++n_past_dft;
|
||||||
@ -486,7 +479,7 @@ int main(int argc, char ** argv) {
|
|||||||
const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl);
|
const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl);
|
||||||
|
|
||||||
for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
|
for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
|
||||||
LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
|
LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
|
||||||
k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
|
k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -495,7 +488,7 @@ int main(int argc, char ** argv) {
|
|||||||
// attempt to split the branch if the probability is high enough
|
// attempt to split the branch if the probability is high enough
|
||||||
for (int f = 1; f < 8; ++f) {
|
for (int f = 1; f < 8; ++f) {
|
||||||
if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) {
|
if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) {
|
||||||
LOG("splitting seq %3d into %3d\n", s, n_seq_cur);
|
LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);
|
||||||
|
|
||||||
llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1);
|
llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1);
|
||||||
llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
|
llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
|
||||||
@ -584,7 +577,7 @@ int main(int argc, char ** argv) {
|
|||||||
llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
|
llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
// LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
|
// LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
|
||||||
llama_decode(ctx_tgt, batch_tgt);
|
llama_decode(ctx_tgt, batch_tgt);
|
||||||
++n_past_tgt;
|
++n_past_tgt;
|
||||||
}
|
}
|
||||||
@ -602,23 +595,25 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
auto t_dec_end = ggml_time_us();
|
auto t_dec_end = ggml_time_us();
|
||||||
|
|
||||||
LOG_TEE("\n\n");
|
LOG("\n\n");
|
||||||
|
|
||||||
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
|
LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
|
||||||
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
|
LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG_INF("\n");
|
||||||
LOG_TEE("n_draft = %d\n", n_draft);
|
LOG_INF("n_draft = %d\n", n_draft);
|
||||||
LOG_TEE("n_predict = %d\n", n_predict);
|
LOG_INF("n_predict = %d\n", n_predict);
|
||||||
LOG_TEE("n_drafted = %d\n", n_drafted);
|
LOG_INF("n_drafted = %d\n", n_drafted);
|
||||||
LOG_TEE("n_accept = %d\n", n_accept);
|
LOG_INF("n_accept = %d\n", n_accept);
|
||||||
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||||
|
|
||||||
LOG_TEE("\ndraft:\n\n");
|
LOG_INF("\n");
|
||||||
|
LOG_INF("draft:\n\n");
|
||||||
// TODO: print sampling/grammar timings for all drafts
|
// TODO: print sampling/grammar timings for all drafts
|
||||||
llama_perf_context_print(ctx_dft);
|
llama_perf_context_print(ctx_dft);
|
||||||
|
|
||||||
LOG_TEE("\ntarget:\n\n");
|
LOG_INF("\n");
|
||||||
|
LOG_INF("target:\n\n");
|
||||||
gpt_perf_print(ctx_tgt, smpl);
|
gpt_perf_print(ctx_tgt, smpl);
|
||||||
|
|
||||||
gpt_sampler_free(smpl);
|
gpt_sampler_free(smpl);
|
||||||
@ -637,7 +632,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
fprintf(stderr, "\n\n");
|
LOG("\n\n");
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -1,11 +1,13 @@
|
|||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
//#include "log.h" // TODO: start using log.h
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cmath>
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
#include <cstring>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <iostream> // TODO: remove me
|
||||||
|
|
||||||
#if defined(_WIN32)
|
#if defined(_WIN32)
|
||||||
#define WIN32_LEAN_AND_MEAN
|
#define WIN32_LEAN_AND_MEAN
|
||||||
@ -13,25 +15,25 @@
|
|||||||
#include <shellapi.h> // For CommandLineToArgvW
|
#include <shellapi.h> // For CommandLineToArgvW
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static void print_usage_information(const char * argv0, FILE * stream) {
|
static void print_usage_information(const char * argv0) {
|
||||||
fprintf(stream, "usage: %s [options]\n\n", argv0);
|
printf("usage: %s [options]\n\n", argv0);
|
||||||
fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n");
|
printf("The tokenize program tokenizes a prompt using a given model,\n");
|
||||||
fprintf(stream, "and prints the resulting tokens to standard output.\n\n");
|
printf("and prints the resulting tokens to standard output.\n\n");
|
||||||
fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n");
|
printf("It needs a model file, a prompt, and optionally other flags\n");
|
||||||
fprintf(stream, "to control the behavior of the tokenizer.\n\n");
|
printf("to control the behavior of the tokenizer.\n\n");
|
||||||
fprintf(stream, " The possible options are:\n");
|
printf(" The possible options are:\n");
|
||||||
fprintf(stream, "\n");
|
printf("\n");
|
||||||
fprintf(stream, " -h, --help print this help and exit\n");
|
printf(" -h, --help print this help and exit\n");
|
||||||
fprintf(stream, " -m MODEL_PATH, --model MODEL_PATH path to model.\n");
|
printf(" -m MODEL_PATH, --model MODEL_PATH path to model.\n");
|
||||||
fprintf(stream, " --ids if given, only print numerical token IDs, and not token strings.\n");
|
printf(" --ids if given, only print numerical token IDs, and not token strings.\n");
|
||||||
fprintf(stream, " The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
|
printf(" The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
|
||||||
fprintf(stream, " -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
|
printf(" -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
|
||||||
fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
|
printf(" -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
|
||||||
fprintf(stream, " --stdin read prompt from standard input.\n");
|
printf(" --stdin read prompt from standard input.\n");
|
||||||
fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
|
printf(" --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
|
||||||
fprintf(stream, " --no-parse-special do not parse control tokens.\n");
|
printf(" --no-parse-special do not parse control tokens.\n");
|
||||||
fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
|
printf(" --log-disable disable logs. Makes stderr quiet when loading the model.\n");
|
||||||
fprintf(stream, " --show-count print the total number of tokens.\n");
|
printf(" --show-count print the total number of tokens.\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
|
static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
|
||||||
@ -185,7 +187,7 @@ int main(int raw_argc, char ** raw_argv) {
|
|||||||
const int argc = argv.size();
|
const int argc = argv.size();
|
||||||
|
|
||||||
if (argc <= 1) {
|
if (argc <= 1) {
|
||||||
print_usage_information(argv[0].c_str(), stderr);
|
print_usage_information(argv[0].c_str());
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -214,7 +216,7 @@ int main(int raw_argc, char ** raw_argv) {
|
|||||||
for (; iarg < argc; ++iarg) {
|
for (; iarg < argc; ++iarg) {
|
||||||
std::string arg{argv[iarg]};
|
std::string arg{argv[iarg]};
|
||||||
if (arg == "-h" || arg == "--help") {
|
if (arg == "-h" || arg == "--help") {
|
||||||
print_usage_information(argv[0].c_str(), stdout);
|
print_usage_information(argv[0].c_str());
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
else if (arg == "--ids") {
|
else if (arg == "--ids") {
|
||||||
@ -323,10 +325,6 @@ int main(int raw_argc, char ** raw_argv) {
|
|||||||
// Start actually doing the tokenizing stuff.
|
// Start actually doing the tokenizing stuff.
|
||||||
//////
|
//////
|
||||||
|
|
||||||
#ifdef LOG_DISABLE_LOGS
|
|
||||||
disable_logging = true;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (disable_logging) {
|
if (disable_logging) {
|
||||||
llama_log_set(llama_log_callback_null, NULL);
|
llama_log_set(llama_log_callback_null, NULL);
|
||||||
}
|
}
|
||||||
|
@ -564,10 +564,11 @@ extern "C" {
|
|||||||
};
|
};
|
||||||
|
|
||||||
enum ggml_log_level {
|
enum ggml_log_level {
|
||||||
GGML_LOG_LEVEL_ERROR = 2,
|
GGML_LOG_LEVEL_NONE = 0,
|
||||||
GGML_LOG_LEVEL_WARN = 3,
|
GGML_LOG_LEVEL_INFO = 1,
|
||||||
GGML_LOG_LEVEL_INFO = 4,
|
GGML_LOG_LEVEL_WARN = 2,
|
||||||
GGML_LOG_LEVEL_DEBUG = 5
|
GGML_LOG_LEVEL_ERROR = 3,
|
||||||
|
GGML_LOG_LEVEL_DEBUG = 4,
|
||||||
};
|
};
|
||||||
|
|
||||||
enum ggml_tensor_flag {
|
enum ggml_tensor_flag {
|
||||||
|
@ -13,13 +13,16 @@
|
|||||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||||
|
|
||||||
#ifdef GGML_METAL_NDEBUG
|
#ifdef GGML_METAL_NDEBUG
|
||||||
|
#define GGML_METAL_LOG(...)
|
||||||
#define GGML_METAL_LOG_INFO(...)
|
#define GGML_METAL_LOG_INFO(...)
|
||||||
#define GGML_METAL_LOG_WARN(...)
|
#define GGML_METAL_LOG_WARN(...)
|
||||||
#define GGML_METAL_LOG_ERROR(...)
|
#define GGML_METAL_LOG_ERROR(...)
|
||||||
#else
|
#else
|
||||||
|
#define GGML_METAL_LOG(...) ggml_metal_log(GGML_LOG_LEVEL_NONE, __VA_ARGS__)
|
||||||
#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
|
#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
|
||||||
#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
|
#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
|
||||||
#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||||
|
#define GGML_METAL_LOG_DEBUG(...) ggml_metal_log(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define UNUSED(x) (void)(x)
|
#define UNUSED(x) (void)(x)
|
||||||
@ -3183,7 +3186,7 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
|
|||||||
#ifndef GGML_METAL_NDEBUG
|
#ifndef GGML_METAL_NDEBUG
|
||||||
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
|
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
|
||||||
if (@available(macOS 10.12, iOS 16.0, *)) {
|
if (@available(macOS 10.12, iOS 16.0, *)) {
|
||||||
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)",
|
GGML_METAL_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n",
|
||||||
__func__,
|
__func__,
|
||||||
size_aligned / 1024.0 / 1024.0,
|
size_aligned / 1024.0 / 1024.0,
|
||||||
device.currentAllocatedSize / 1024.0 / 1024.0,
|
device.currentAllocatedSize / 1024.0 / 1024.0,
|
||||||
@ -3191,8 +3194,6 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
|
|||||||
|
|
||||||
if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
|
if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
|
||||||
GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
|
GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
|
||||||
} else {
|
|
||||||
GGML_METAL_LOG_INFO("\n");
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
|
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
|
||||||
|
@ -24,6 +24,7 @@ LLAMA_ATTRIBUTE_FORMAT(2, 3)
|
|||||||
void llama_log_internal (ggml_log_level level, const char * format, ...);
|
void llama_log_internal (ggml_log_level level, const char * format, ...);
|
||||||
void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
|
void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
|
||||||
|
|
||||||
|
#define LLAMA_LOG(...) llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
|
||||||
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
||||||
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
|
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
|
||||||
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||||
|
@ -18074,9 +18074,9 @@ struct llama_model * llama_load_model_from_file(
|
|||||||
unsigned percentage = (unsigned) (100 * progress);
|
unsigned percentage = (unsigned) (100 * progress);
|
||||||
while (percentage > *cur_percentage_p) {
|
while (percentage > *cur_percentage_p) {
|
||||||
*cur_percentage_p = percentage;
|
*cur_percentage_p = percentage;
|
||||||
LLAMA_LOG_INFO(".");
|
LLAMA_LOG(".");
|
||||||
if (percentage >= 100) {
|
if (percentage >= 100) {
|
||||||
LLAMA_LOG_INFO("\n");
|
LLAMA_LOG("\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
@ -20781,8 +20781,8 @@ static void llama_log_internal_v(ggml_log_level level, const char * format, va_l
|
|||||||
if (len < 128) {
|
if (len < 128) {
|
||||||
g_state.log_callback(level, buffer, g_state.log_callback_user_data);
|
g_state.log_callback(level, buffer, g_state.log_callback_user_data);
|
||||||
} else {
|
} else {
|
||||||
char* buffer2 = new char[len+1];
|
char * buffer2 = new char[len + 1];
|
||||||
vsnprintf(buffer2, len+1, format, args_copy);
|
vsnprintf(buffer2, len + 1, format, args_copy);
|
||||||
buffer2[len] = 0;
|
buffer2[len] = 0;
|
||||||
g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
|
g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
|
||||||
delete[] buffer2;
|
delete[] buffer2;
|
||||||
|
@ -108,6 +108,7 @@ llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CU
|
|||||||
#llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
|
#llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
|
||||||
|
|
||||||
# llama_target_and_test(test-double-float.cpp) # SLOW
|
# llama_target_and_test(test-double-float.cpp) # SLOW
|
||||||
|
llama_target_and_test(test-log.cpp)
|
||||||
llama_target_and_test(test-arg-parser.cpp)
|
llama_target_and_test(test-arg-parser.cpp)
|
||||||
llama_target_and_test(test-quantize-fns.cpp)
|
llama_target_and_test(test-quantize-fns.cpp)
|
||||||
llama_target_and_test(test-quantize-perf.cpp)
|
llama_target_and_test(test-quantize-perf.cpp)
|
||||||
|
@ -85,7 +85,7 @@ int main(void) {
|
|||||||
|
|
||||||
argv = {"binary_name", "--verbose"};
|
argv = {"binary_name", "--verbose"};
|
||||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||||
assert(params.verbosity == 1);
|
assert(params.verbosity > 1);
|
||||||
|
|
||||||
argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
|
argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
|
||||||
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||||
|
39
tests/test-log.cpp
Normal file
39
tests/test-log.cpp
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
#include "log.h"
|
||||||
|
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <thread>
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
const int n_thread = 8;
|
||||||
|
|
||||||
|
std::thread threads[n_thread];
|
||||||
|
for (int i = 0; i < n_thread; i++) {
|
||||||
|
threads[i] = std::thread([i]() {
|
||||||
|
const int n_msg = 1000;
|
||||||
|
|
||||||
|
for (int j = 0; j < n_msg; j++) {
|
||||||
|
const int log_type = std::rand() % 4;
|
||||||
|
|
||||||
|
switch (log_type) {
|
||||||
|
case 0: LOG_INF("Thread %d: %d\n", i, j); break;
|
||||||
|
case 1: LOG_WRN("Thread %d: %d\n", i, j); break;
|
||||||
|
case 2: LOG_ERR("Thread %d: %d\n", i, j); break;
|
||||||
|
case 3: LOG_DBG("Thread %d: %d\n", i, j); break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (rand () % 10 < 5) {
|
||||||
|
gpt_log_set_timestamps(gpt_log_main(), rand() % 2);
|
||||||
|
gpt_log_set_prefix (gpt_log_main(), rand() % 2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < n_thread; i++) {
|
||||||
|
threads[i].join();
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user