mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-26 03:14:35 +00:00
Compare commits
5 Commits
7bc4f63f2b
...
accf266be6
Author | SHA1 | Date | |
---|---|---|---|
|
accf266be6 | ||
|
30caac3a68 | ||
|
60cfa728e2 | ||
|
3327bb0f8d | ||
|
a2d4b6fc81 |
@ -234,6 +234,7 @@ function(ggml_add_backend_library backend)
|
|||||||
# write the shared library to the output directory
|
# write the shared library to the output directory
|
||||||
set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
||||||
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
|
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
|
||||||
|
add_dependencies(ggml ${backend})
|
||||||
else()
|
else()
|
||||||
add_library(${backend} ${ARGN})
|
add_library(${backend} ${ARGN})
|
||||||
target_link_libraries(ggml PUBLIC ${backend})
|
target_link_libraries(ggml PUBLIC ${backend})
|
||||||
|
@ -66,6 +66,26 @@
|
|||||||
#include "ggml-kompute.h"
|
#include "ggml-kompute.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// disable C++17 deprecation warning for std::codecvt_utf8
|
||||||
|
#if defined(__clang__)
|
||||||
|
# pragma clang diagnostic push
|
||||||
|
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Converts a UTF-8 encoded std::string into a std::wstring using
// std::wstring_convert with the std::codecvt_utf8_utf16<wchar_t> facet.
// NOTE(review): the converter is default-constructed (no error string), so a
// malformed input is presumably reported via an exception from from_bytes()
// rather than a sentinel return value - confirm callers are prepared for that.
static std::wstring utf8_to_utf16(const std::string & str) {
|
||||||
|
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
|
||||||
|
return converter.from_bytes(str);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Converts a std::wstring back into a UTF-8 encoded std::string; the inverse
// of utf8_to_utf16, using the same std::codecvt_utf8_utf16<wchar_t> facet.
// Used by the loader code to print wide paths in log messages and to pass
// them to dlopen().
// NOTE(review): to_bytes() on a default-constructed converter presumably
// throws on invalid input instead of returning an error value - confirm.
static std::string utf16_to_utf8(const std::wstring & str) {
|
||||||
|
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
|
||||||
|
return converter.to_bytes(str);
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined(__clang__)
|
||||||
|
# pragma clang diagnostic pop
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
|
|
||||||
using dl_handle = std::remove_pointer_t<HMODULE>;
|
using dl_handle = std::remove_pointer_t<HMODULE>;
|
||||||
@ -88,11 +108,6 @@ static dl_handle * dl_load_library(const std::wstring & path) {
|
|||||||
return handle;
|
return handle;
|
||||||
}
|
}
|
||||||
|
|
||||||
static dl_handle * dl_load_library(const std::string & path) {
|
|
||||||
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
|
|
||||||
return dl_load_library(converter.from_bytes(path));
|
|
||||||
}
|
|
||||||
|
|
||||||
static void * dl_get_sym(dl_handle * handle, const char * name) {
|
static void * dl_get_sym(dl_handle * handle, const char * name) {
|
||||||
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
|
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
|
||||||
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
||||||
@ -114,8 +129,8 @@ struct dl_handle_deleter {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
static void * dl_load_library(const std::string & path) {
|
static void * dl_load_library(const std::wstring & path) {
|
||||||
dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
|
dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
|
||||||
|
|
||||||
return handle;
|
return handle;
|
||||||
}
|
}
|
||||||
@ -202,11 +217,11 @@ struct ggml_backend_registry {
|
|||||||
devices.push_back(device);
|
devices.push_back(device);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_reg_t load_backend(const char * path, bool silent) {
|
ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
|
||||||
dl_handle_ptr handle { dl_load_library(path) };
|
dl_handle_ptr handle { dl_load_library(path) };
|
||||||
if (!handle) {
|
if (!handle) {
|
||||||
if (!silent) {
|
if (!silent) {
|
||||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
|
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
|
||||||
}
|
}
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
@ -214,7 +229,7 @@ struct ggml_backend_registry {
|
|||||||
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
|
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
|
||||||
if (score_fn && score_fn() == 0) {
|
if (score_fn && score_fn() == 0) {
|
||||||
if (!silent) {
|
if (!silent) {
|
||||||
GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
|
GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
|
||||||
}
|
}
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
@ -222,7 +237,7 @@ struct ggml_backend_registry {
|
|||||||
auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
|
auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
|
||||||
if (!backend_init_fn) {
|
if (!backend_init_fn) {
|
||||||
if (!silent) {
|
if (!silent) {
|
||||||
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path);
|
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
|
||||||
}
|
}
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
@ -231,16 +246,16 @@ struct ggml_backend_registry {
|
|||||||
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
|
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
|
||||||
if (!silent) {
|
if (!silent) {
|
||||||
if (!reg) {
|
if (!reg) {
|
||||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
|
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
|
||||||
} else {
|
} else {
|
||||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
|
GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
|
||||||
__func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
|
__func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
|
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
|
||||||
|
|
||||||
register_backend(reg, std::move(handle));
|
register_backend(reg, std::move(handle));
|
||||||
|
|
||||||
@ -376,14 +391,14 @@ ggml_backend_t ggml_backend_init_best(void) {
|
|||||||
|
|
||||||
// Dynamic loading
|
// Dynamic loading
|
||||||
ggml_backend_reg_t ggml_backend_load(const char * path) {
|
ggml_backend_reg_t ggml_backend_load(const char * path) {
|
||||||
return get_reg().load_backend(path, false);
|
return get_reg().load_backend(utf8_to_utf16(path), false);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_unload(ggml_backend_reg_t reg) {
|
void ggml_backend_unload(ggml_backend_reg_t reg) {
|
||||||
get_reg().unload_backend(reg, true);
|
get_reg().unload_backend(reg, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::string get_executable_path() {
|
static std::wstring get_executable_path() {
|
||||||
#if defined(__APPLE__)
|
#if defined(__APPLE__)
|
||||||
// get executable path
|
// get executable path
|
||||||
std::vector<char> path;
|
std::vector<char> path;
|
||||||
@ -401,7 +416,7 @@ static std::string get_executable_path() {
|
|||||||
if (last_slash != std::string::npos) {
|
if (last_slash != std::string::npos) {
|
||||||
base_path = base_path.substr(0, last_slash);
|
base_path = base_path.substr(0, last_slash);
|
||||||
}
|
}
|
||||||
return base_path + "/";
|
return utf8_to_utf16(base_path + "/");
|
||||||
#elif defined(__linux__) || defined(__FreeBSD__)
|
#elif defined(__linux__) || defined(__FreeBSD__)
|
||||||
std::string base_path = ".";
|
std::string base_path = ".";
|
||||||
std::vector<char> path(1024);
|
std::vector<char> path(1024);
|
||||||
@ -427,57 +442,63 @@ static std::string get_executable_path() {
|
|||||||
path.resize(path.size() * 2);
|
path.resize(path.size() * 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
return base_path + "/";
|
return utf8_to_utf16(base_path + "/");
|
||||||
#elif defined(_WIN32)
|
#elif defined(_WIN32)
|
||||||
std::vector<char> path(MAX_PATH);
|
std::vector<wchar_t> path(MAX_PATH);
|
||||||
DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
|
DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
|
||||||
if (len == 0) {
|
if (len == 0) {
|
||||||
return "";
|
return {};
|
||||||
}
|
}
|
||||||
std::string base_path(path.data(), len);
|
std::wstring base_path(path.data(), len);
|
||||||
// remove executable name
|
// remove executable name
|
||||||
auto last_slash = base_path.find_last_of('\\');
|
auto last_slash = base_path.find_last_of('\\');
|
||||||
if (last_slash != std::string::npos) {
|
if (last_slash != std::string::npos) {
|
||||||
base_path = base_path.substr(0, last_slash);
|
base_path = base_path.substr(0, last_slash);
|
||||||
}
|
}
|
||||||
return base_path + "\\";
|
return base_path + L"\\";
|
||||||
|
#else
|
||||||
|
return {};
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::string backend_filename_prefix() {
|
static std::wstring backend_filename_prefix() {
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
return "ggml-";
|
return L"ggml-";
|
||||||
#else
|
#else
|
||||||
return "libggml-";
|
return L"libggml-";
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::string backend_filename_suffix() {
|
static std::wstring backend_filename_suffix() {
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
return ".dll";
|
return L".dll";
|
||||||
#else
|
#else
|
||||||
return ".so";
|
return L".so";
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::wstring path_separator() {
|
||||||
|
#ifdef _WIN32
|
||||||
|
return L"\\";
|
||||||
|
#else
|
||||||
|
return L"/";
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
|
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
|
||||||
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
|
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
|
||||||
// TODO: search system paths
|
// TODO: search system paths
|
||||||
std::string file_prefix = backend_filename_prefix() + name + "-";
|
std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
|
||||||
std::vector<std::string> search_paths;
|
std::vector<std::wstring> search_paths;
|
||||||
if (user_search_path == nullptr) {
|
if (user_search_path == nullptr) {
|
||||||
search_paths.push_back("./");
|
search_paths.push_back(L"." + path_separator());
|
||||||
search_paths.push_back(get_executable_path());
|
search_paths.push_back(get_executable_path());
|
||||||
} else {
|
} else {
|
||||||
#if defined(_WIN32)
|
search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
|
||||||
search_paths.push_back(std::string(user_search_path) + "\\");
|
|
||||||
#else
|
|
||||||
search_paths.push_back(std::string(user_search_path) + "/");
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int best_score = 0;
|
int best_score = 0;
|
||||||
std::string best_path;
|
std::wstring best_path;
|
||||||
|
|
||||||
namespace fs = std::filesystem;
|
namespace fs = std::filesystem;
|
||||||
for (const auto & search_path : search_paths) {
|
for (const auto & search_path : search_paths) {
|
||||||
@ -487,27 +508,27 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
|
|||||||
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
|
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
|
||||||
for (const auto & entry : dir_it) {
|
for (const auto & entry : dir_it) {
|
||||||
if (entry.is_regular_file()) {
|
if (entry.is_regular_file()) {
|
||||||
std::string filename = entry.path().filename().string();
|
std::wstring filename = entry.path().filename().wstring();
|
||||||
std::string ext = entry.path().extension().string();
|
std::wstring ext = entry.path().extension().wstring();
|
||||||
if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
|
if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
|
||||||
dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
|
dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
|
||||||
if (!handle && !silent) {
|
if (!handle && !silent) {
|
||||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
|
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
|
||||||
}
|
}
|
||||||
if (handle) {
|
if (handle) {
|
||||||
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
|
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
|
||||||
if (score_fn) {
|
if (score_fn) {
|
||||||
int s = score_fn();
|
int s = score_fn();
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
|
GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
|
||||||
#endif
|
#endif
|
||||||
if (s > best_score) {
|
if (s > best_score) {
|
||||||
best_score = s;
|
best_score = s;
|
||||||
best_path = entry.path().string();
|
best_path = entry.path().wstring();
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (!silent) {
|
if (!silent) {
|
||||||
GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
|
GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -519,15 +540,15 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
|
|||||||
if (best_score == 0) {
|
if (best_score == 0) {
|
||||||
// try to load the base backend
|
// try to load the base backend
|
||||||
for (const auto & search_path : search_paths) {
|
for (const auto & search_path : search_paths) {
|
||||||
std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix();
|
std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
|
||||||
if (fs::exists(path)) {
|
if (fs::exists(path)) {
|
||||||
return get_reg().load_backend(path.c_str(), silent);
|
return get_reg().load_backend(path, silent);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
return get_reg().load_backend(best_path.c_str(), silent);
|
return get_reg().load_backend(best_path, silent);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_load_all() {
|
void ggml_backend_load_all() {
|
||||||
|
@ -135,14 +135,20 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
# show enabled features
|
# show enabled features
|
||||||
|
if (CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
|
||||||
|
set(FEAT_INPUT_FILE "NUL")
|
||||||
|
else()
|
||||||
|
set(FEAT_INPUT_FILE "/dev/null")
|
||||||
|
endif()
|
||||||
|
|
||||||
execute_process(
|
execute_process(
|
||||||
COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E -
|
COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E -
|
||||||
INPUT_FILE "/dev/null"
|
INPUT_FILE ${FEAT_INPUT_FILE}
|
||||||
OUTPUT_VARIABLE ARM_FEATURE
|
OUTPUT_VARIABLE ARM_FEATURE
|
||||||
RESULT_VARIABLE ARM_FEATURE_RESULT
|
RESULT_VARIABLE ARM_FEATURE_RESULT
|
||||||
)
|
)
|
||||||
if (ARM_FEATURE_RESULT)
|
if (ARM_FEATURE_RESULT)
|
||||||
message(FATAL_ERROR "Failed to get ARM features")
|
message(WARNING "Failed to get ARM features")
|
||||||
else()
|
else()
|
||||||
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC)
|
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC)
|
||||||
string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
|
string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
|
||||||
@ -317,6 +323,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
|||||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
|
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
|
||||||
|
|
||||||
if (GGML_BACKEND_DL)
|
if (GGML_BACKEND_DL)
|
||||||
|
if (GGML_NATIVE)
|
||||||
|
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
|
||||||
|
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
|
||||||
|
endif()
|
||||||
|
|
||||||
# The feature detection code is compiled as a separate target so that
|
# The feature detection code is compiled as a separate target so that
|
||||||
# it can be built without the architecture flags
|
# it can be built without the architecture flags
|
||||||
# Since multiple variants of the CPU backend may be included in the same
|
# Since multiple variants of the CPU backend may be included in the same
|
||||||
|
@ -1657,7 +1657,7 @@ bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token t
|
|||||||
}
|
}
|
||||||
|
|
||||||
llama_token llama_token_bos_impl(const struct llama_vocab & vocab) {
|
llama_token llama_token_bos_impl(const struct llama_vocab & vocab) {
|
||||||
return vocab.special_bos_id;
|
return vocab.type != LLAMA_VOCAB_TYPE_WPM ? vocab.special_bos_id : vocab.special_cls_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {
|
llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {
|
||||||
|
@ -45,7 +45,7 @@ struct llama_vocab {
|
|||||||
id special_unk_id = 0;
|
id special_unk_id = 0;
|
||||||
id special_sep_id = LLAMA_TOKEN_NULL;
|
id special_sep_id = LLAMA_TOKEN_NULL;
|
||||||
id special_pad_id = LLAMA_TOKEN_NULL;
|
id special_pad_id = LLAMA_TOKEN_NULL;
|
||||||
id special_cls_id = LLAMA_TOKEN_NULL;
|
id special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
|
||||||
id special_mask_id = LLAMA_TOKEN_NULL;
|
id special_mask_id = LLAMA_TOKEN_NULL;
|
||||||
|
|
||||||
id linefeed_id = 13;
|
id linefeed_id = 13;
|
||||||
|
@ -3053,6 +3053,13 @@ struct llama_kv_cache {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// block of KV slots to move when defragging
|
||||||
|
struct llama_kv_defrag_move {
|
||||||
|
uint32_t src; // index of the first KV cell to copy from (used as the source view offset in build_defrag)
|
||||||
|
uint32_t dst; // index of the first KV cell to copy to (used as the destination view offset in build_defrag)
|
||||||
|
uint32_t len; // number of consecutive cells in this move; starts at 1 and is incremented while the block stays contiguous
|
||||||
|
};
|
||||||
|
|
||||||
struct llama_control_vector {
|
struct llama_control_vector {
|
||||||
std::vector<struct ggml_tensor *> tensors; // per layer
|
std::vector<struct ggml_tensor *> tensors; // per layer
|
||||||
std::vector<ggml_context_ptr> ctxs;
|
std::vector<ggml_context_ptr> ctxs;
|
||||||
@ -10990,35 +10997,23 @@ struct llm_build_context {
|
|||||||
return gf;
|
return gf;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
|
struct ggml_cgraph * build_defrag(const std::vector<struct llama_kv_defrag_move> & moves) {
|
||||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||||
|
|
||||||
for (uint32_t i = 0; i < ids.size(); ++i) {
|
for (const auto & move : moves) {
|
||||||
const uint32_t id = ids[i];
|
|
||||||
|
|
||||||
if (i == id || id == ids.size()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
uint32_t nm = 1;
|
|
||||||
|
|
||||||
while (i + nm < ids.size() && ids[i + nm] == id + nm) {
|
|
||||||
nm++;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
|
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
|
||||||
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
|
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
|
||||||
|
|
||||||
ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
|
ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
|
||||||
n_embd_k_gqa, nm,
|
n_embd_k_gqa, move.len,
|
||||||
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
||||||
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
|
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.src));
|
||||||
|
|
||||||
ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
|
ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
|
||||||
n_embd_k_gqa, nm,
|
n_embd_k_gqa, move.len,
|
||||||
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
||||||
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
|
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.dst));
|
||||||
|
|
||||||
ggml_tensor * view_v_src;
|
ggml_tensor * view_v_src;
|
||||||
ggml_tensor * view_v_dst;
|
ggml_tensor * view_v_dst;
|
||||||
@ -11026,31 +11021,29 @@ struct llm_build_context {
|
|||||||
if (flash_attn) {
|
if (flash_attn) {
|
||||||
// NOTE: the V cache is not transposed when using flash attention
|
// NOTE: the V cache is not transposed when using flash attention
|
||||||
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
|
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
|
||||||
n_embd_v_gqa, nm,
|
n_embd_v_gqa, move.len,
|
||||||
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
|
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
|
||||||
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
|
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.src));
|
||||||
|
|
||||||
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
|
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
|
||||||
n_embd_v_gqa, nm,
|
n_embd_v_gqa, move.len,
|
||||||
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
|
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
|
||||||
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
|
ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.dst));
|
||||||
} else {
|
} else {
|
||||||
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
|
view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
|
||||||
nm, n_embd_v_gqa,
|
move.len, n_embd_v_gqa,
|
||||||
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
|
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
|
||||||
ggml_row_size(kv_self.v_l[il]->type, i));
|
ggml_row_size(kv_self.v_l[il]->type, move.src));
|
||||||
|
|
||||||
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
|
view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
|
||||||
nm, n_embd_v_gqa,
|
move.len, n_embd_v_gqa,
|
||||||
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
|
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
|
||||||
ggml_row_size(kv_self.v_l[il]->type, id));
|
ggml_row_size(kv_self.v_l[il]->type, move.dst));
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
|
||||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
|
||||||
}
|
}
|
||||||
|
|
||||||
i += nm - 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
|
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
|
||||||
@ -17601,7 +17594,7 @@ struct llm_build_context {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<struct llama_kv_defrag_move> & moves) {
|
||||||
llama_ubatch dummy = {};
|
llama_ubatch dummy = {};
|
||||||
dummy.equal_seqs = true;
|
dummy.equal_seqs = true;
|
||||||
|
|
||||||
@ -17611,7 +17604,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
|
|||||||
|
|
||||||
llm.init();
|
llm.init();
|
||||||
|
|
||||||
struct ggml_cgraph * result = llm.build_defrag(ids);
|
struct ggml_cgraph * result = llm.build_defrag(moves);
|
||||||
|
|
||||||
llm.free();
|
llm.free();
|
||||||
|
|
||||||
@ -18627,7 +18620,12 @@ static int llama_decode_internal(
|
|||||||
kv_self.head = 0;
|
kv_self.head = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
|
auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
|
||||||
|
if (!slot) {
|
||||||
|
llama_kv_cache_defrag(kv_self);
|
||||||
|
llama_kv_cache_update(&lctx);
|
||||||
|
slot = llama_kv_cache_find_slot(kv_self, ubatch);
|
||||||
|
}
|
||||||
if (!slot) {
|
if (!slot) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@ -19030,8 +19028,8 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
|
|||||||
|
|
||||||
//const int64_t t_start = ggml_time_us();
|
//const int64_t t_start = ggml_time_us();
|
||||||
|
|
||||||
// number of cells moved
|
// groups of cells moved
|
||||||
uint32_t n_moves = 0;
|
std::vector<struct llama_kv_defrag_move> moves;
|
||||||
|
|
||||||
// each move requires 6*n_layer tensors (see build_defrag)
|
// each move requires 6*n_layer tensors (see build_defrag)
|
||||||
// - source view, destination view, copy operation
|
// - source view, destination view, copy operation
|
||||||
@ -19095,19 +19093,11 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
|
|||||||
// are we moving a continuous block of memory?
|
// are we moving a continuous block of memory?
|
||||||
bool cont = false;
|
bool cont = false;
|
||||||
|
|
||||||
// should we stop searching for the next move?
|
|
||||||
bool stop = false;
|
|
||||||
|
|
||||||
// go back and move the nf cells to the hole
|
// go back and move the nf cells to the hole
|
||||||
for (; i1 < n_kv; ++i1) {
|
for (; i1 < n_kv; ++i1) {
|
||||||
auto & cell1 = kv_self.cells[i1];
|
auto & cell1 = kv_self.cells[i1];
|
||||||
|
|
||||||
if (cell1.is_empty() || ids[i1] != n_kv) {
|
if (cell1.is_empty() || ids[i1] != n_kv) {
|
||||||
if (n_moves == max_moves) {
|
|
||||||
stop = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
cont = false;
|
cont = false;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -19123,8 +19113,10 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
|
|||||||
kv_self.head = n_used;
|
kv_self.head = n_used;
|
||||||
|
|
||||||
if (!cont) {
|
if (!cont) {
|
||||||
n_moves++;
|
moves.push_back({i1, i0 + nf, 1});
|
||||||
cont = true;
|
cont = true;
|
||||||
|
} else {
|
||||||
|
moves.back().len++;
|
||||||
}
|
}
|
||||||
|
|
||||||
nf++;
|
nf++;
|
||||||
@ -19134,22 +19126,16 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (stop || n_moves == max_moves) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
//LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
|
//LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
|
||||||
|
|
||||||
i0 += nh - 1;
|
i0 += nh - 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (n_moves == 0) {
|
if (moves.size() == 0) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
//LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
|
//LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", moves.size());
|
||||||
|
|
||||||
//LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
|
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
// CPU defrag
|
// CPU defrag
|
||||||
@ -19224,11 +19210,18 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
|
|||||||
#else
|
#else
|
||||||
// ggml_graph defrag
|
// ggml_graph defrag
|
||||||
|
|
||||||
|
for (std::size_t i = 0; i < moves.size(); i += max_moves) {
|
||||||
|
std::vector<struct llama_kv_defrag_move> chunk;
|
||||||
|
auto end = std::min(i + max_moves, moves.size());
|
||||||
|
chunk.assign(moves.begin() + i, moves.begin() + end);
|
||||||
|
|
||||||
ggml_backend_sched_reset(lctx.sched.get());
|
ggml_backend_sched_reset(lctx.sched.get());
|
||||||
|
|
||||||
ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
|
//LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*chunk.size()*n_layer);
|
||||||
|
ggml_cgraph * gf = llama_build_graph_defrag(lctx, chunk);
|
||||||
|
|
||||||
llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
|
llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
//const int64_t t_end = ggml_time_us();
|
//const int64_t t_end = ggml_time_us();
|
||||||
|
Loading…
Reference in New Issue
Block a user