Compare commits

...

7 Commits

Author SHA1 Message Date
Yann Follet
6750319799
Merge 14f64dab74 into 30caac3a68 2024-12-24 09:26:47 +01:00
Georgi Gerganov
30caac3a68
llama : the WPM vocabs use the CLS token as BOS (#10930)
* llama : the WPM vocabs use the CLS token as BOS

ggml-ci

* llama : add comment
2024-12-24 09:44:20 +02:00
Diego Devesa
60cfa728e2
ggml : use wstring for backend search paths (#10960)
ggml-ci
2024-12-24 04:05:27 +01:00
Diego Devesa
3327bb0f8d
ggml : fix arm enabled features check (#10961) 2024-12-24 04:05:17 +01:00
Yann Follet
14f64dab74
Merge branch 'ggerganov:master' into cuda-build-doc 2024-12-12 17:15:04 +08:00
Yann Follet
f70b5148e1 Merge branch 'cuda-build-doc' of https://github.com/YannFollet/llama.cpp into cuda-build-doc 2024-12-10 01:35:13 +00:00
Yann Follet
de1bb5a4ad Add build command for CUDA with path example 2024-12-10 01:32:10 +00:00
6 changed files with 91 additions and 52 deletions

View File

@ -134,6 +134,12 @@ This provides GPU acceleration using an NVIDIA GPU. Make sure to have the CUDA t
cmake --build build --config Release cmake --build build --config Release
``` ```
- Using `CMake` with path :
```bash
rm -rf build & /usr/local/bin/cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc
/usr/local/bin/cmake --build build --config Release -j
```
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`. The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. In Windows this setting is available in the NVIDIA control panel as `System Memory Fallback`.

View File

@ -234,6 +234,7 @@ function(ggml_add_backend_library backend)
# write the shared library to the output directory # write the shared library to the output directory
set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL) target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
add_dependencies(ggml ${backend})
else() else()
add_library(${backend} ${ARGN}) add_library(${backend} ${ARGN})
target_link_libraries(ggml PUBLIC ${backend}) target_link_libraries(ggml PUBLIC ${backend})

View File

@ -66,6 +66,26 @@
#include "ggml-kompute.h" #include "ggml-kompute.h"
#endif #endif
// disable C++17 deprecation warning for std::codecvt_utf8
#if defined(__clang__)
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
#endif
static std::wstring utf8_to_utf16(const std::string & str) {
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
return converter.from_bytes(str);
}
static std::string utf16_to_utf8(const std::wstring & str) {
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
return converter.to_bytes(str);
}
#if defined(__clang__)
# pragma clang diagnostic pop
#endif
#ifdef _WIN32 #ifdef _WIN32
using dl_handle = std::remove_pointer_t<HMODULE>; using dl_handle = std::remove_pointer_t<HMODULE>;
@ -88,11 +108,6 @@ static dl_handle * dl_load_library(const std::wstring & path) {
return handle; return handle;
} }
static dl_handle * dl_load_library(const std::string & path) {
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
return dl_load_library(converter.from_bytes(path));
}
static void * dl_get_sym(dl_handle * handle, const char * name) { static void * dl_get_sym(dl_handle * handle, const char * name) {
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
@ -114,8 +129,8 @@ struct dl_handle_deleter {
} }
}; };
static void * dl_load_library(const std::string & path) { static void * dl_load_library(const std::wstring & path) {
dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL); dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
return handle; return handle;
} }
@ -202,11 +217,11 @@ struct ggml_backend_registry {
devices.push_back(device); devices.push_back(device);
} }
ggml_backend_reg_t load_backend(const char * path, bool silent) { ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
dl_handle_ptr handle { dl_load_library(path) }; dl_handle_ptr handle { dl_load_library(path) };
if (!handle) { if (!handle) {
if (!silent) { if (!silent) {
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path); GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
} }
return nullptr; return nullptr;
} }
@ -214,7 +229,7 @@ struct ggml_backend_registry {
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
if (score_fn && score_fn() == 0) { if (score_fn && score_fn() == 0) {
if (!silent) { if (!silent) {
GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path); GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
} }
return nullptr; return nullptr;
} }
@ -222,7 +237,7 @@ struct ggml_backend_registry {
auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init"); auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
if (!backend_init_fn) { if (!backend_init_fn) {
if (!silent) { if (!silent) {
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path); GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
} }
return nullptr; return nullptr;
} }
@ -231,16 +246,16 @@ struct ggml_backend_registry {
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) { if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
if (!silent) { if (!silent) {
if (!reg) { if (!reg) {
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path); GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
} else { } else {
GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n", GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
__func__, path, reg->api_version, GGML_BACKEND_API_VERSION); __func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
} }
} }
return nullptr; return nullptr;
} }
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path); GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
register_backend(reg, std::move(handle)); register_backend(reg, std::move(handle));
@ -376,14 +391,14 @@ ggml_backend_t ggml_backend_init_best(void) {
// Dynamic loading // Dynamic loading
ggml_backend_reg_t ggml_backend_load(const char * path) { ggml_backend_reg_t ggml_backend_load(const char * path) {
return get_reg().load_backend(path, false); return get_reg().load_backend(utf8_to_utf16(path), false);
} }
void ggml_backend_unload(ggml_backend_reg_t reg) { void ggml_backend_unload(ggml_backend_reg_t reg) {
get_reg().unload_backend(reg, true); get_reg().unload_backend(reg, true);
} }
static std::string get_executable_path() { static std::wstring get_executable_path() {
#if defined(__APPLE__) #if defined(__APPLE__)
// get executable path // get executable path
std::vector<char> path; std::vector<char> path;
@ -401,7 +416,7 @@ static std::string get_executable_path() {
if (last_slash != std::string::npos) { if (last_slash != std::string::npos) {
base_path = base_path.substr(0, last_slash); base_path = base_path.substr(0, last_slash);
} }
return base_path + "/"; return utf8_to_utf16(base_path + "/");
#elif defined(__linux__) || defined(__FreeBSD__) #elif defined(__linux__) || defined(__FreeBSD__)
std::string base_path = "."; std::string base_path = ".";
std::vector<char> path(1024); std::vector<char> path(1024);
@ -427,57 +442,63 @@ static std::string get_executable_path() {
path.resize(path.size() * 2); path.resize(path.size() * 2);
} }
return base_path + "/"; return utf8_to_utf16(base_path + "/");
#elif defined(_WIN32) #elif defined(_WIN32)
std::vector<char> path(MAX_PATH); std::vector<wchar_t> path(MAX_PATH);
DWORD len = GetModuleFileNameA(NULL, path.data(), path.size()); DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
if (len == 0) { if (len == 0) {
return ""; return {};
} }
std::string base_path(path.data(), len); std::wstring base_path(path.data(), len);
// remove executable name // remove executable name
auto last_slash = base_path.find_last_of('\\'); auto last_slash = base_path.find_last_of('\\');
if (last_slash != std::string::npos) { if (last_slash != std::string::npos) {
base_path = base_path.substr(0, last_slash); base_path = base_path.substr(0, last_slash);
} }
return base_path + "\\"; return base_path + L"\\";
#else
return {};
#endif #endif
} }
static std::string backend_filename_prefix() { static std::wstring backend_filename_prefix() {
#ifdef _WIN32 #ifdef _WIN32
return "ggml-"; return L"ggml-";
#else #else
return "libggml-"; return L"libggml-";
#endif #endif
} }
static std::string backend_filename_suffix() { static std::wstring backend_filename_suffix() {
#ifdef _WIN32 #ifdef _WIN32
return ".dll"; return L".dll";
#else #else
return ".so"; return L".so";
#endif
}
static std::wstring path_separator() {
#ifdef _WIN32
return L"\\";
#else
return L"/";
#endif #endif
} }
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) { static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
// TODO: search system paths // TODO: search system paths
std::string file_prefix = backend_filename_prefix() + name + "-"; std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
std::vector<std::string> search_paths; std::vector<std::wstring> search_paths;
if (user_search_path == nullptr) { if (user_search_path == nullptr) {
search_paths.push_back("./"); search_paths.push_back(L"." + path_separator());
search_paths.push_back(get_executable_path()); search_paths.push_back(get_executable_path());
} else { } else {
#if defined(_WIN32) search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
search_paths.push_back(std::string(user_search_path) + "\\");
#else
search_paths.push_back(std::string(user_search_path) + "/");
#endif
} }
int best_score = 0; int best_score = 0;
std::string best_path; std::wstring best_path;
namespace fs = std::filesystem; namespace fs = std::filesystem;
for (const auto & search_path : search_paths) { for (const auto & search_path : search_paths) {
@ -487,27 +508,27 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied); fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
for (const auto & entry : dir_it) { for (const auto & entry : dir_it) {
if (entry.is_regular_file()) { if (entry.is_regular_file()) {
std::string filename = entry.path().filename().string(); std::wstring filename = entry.path().filename().wstring();
std::string ext = entry.path().extension().string(); std::wstring ext = entry.path().extension().wstring();
if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) { if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
dl_handle_ptr handle { dl_load_library(entry.path().c_str()) }; dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
if (!handle && !silent) { if (!handle && !silent) {
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str()); GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
} }
if (handle) { if (handle) {
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
if (score_fn) { if (score_fn) {
int s = score_fn(); int s = score_fn();
#ifndef NDEBUG #ifndef NDEBUG
GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s); GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
#endif #endif
if (s > best_score) { if (s > best_score) {
best_score = s; best_score = s;
best_path = entry.path().string(); best_path = entry.path().wstring();
} }
} else { } else {
if (!silent) { if (!silent) {
GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str()); GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
} }
} }
} }
@ -519,15 +540,15 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
if (best_score == 0) { if (best_score == 0) {
// try to load the base backend // try to load the base backend
for (const auto & search_path : search_paths) { for (const auto & search_path : search_paths) {
std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix(); std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
if (fs::exists(path)) { if (fs::exists(path)) {
return get_reg().load_backend(path.c_str(), silent); return get_reg().load_backend(path, silent);
} }
} }
return nullptr; return nullptr;
} }
return get_reg().load_backend(best_path.c_str(), silent); return get_reg().load_backend(best_path, silent);
} }
void ggml_backend_load_all() { void ggml_backend_load_all() {

View File

@ -135,14 +135,20 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endif() endif()
# show enabled features # show enabled features
if (CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
set(FEAT_INPUT_FILE "NUL")
else()
set(FEAT_INPUT_FILE "/dev/null")
endif()
execute_process( execute_process(
COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E - COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E -
INPUT_FILE "/dev/null" INPUT_FILE ${FEAT_INPUT_FILE}
OUTPUT_VARIABLE ARM_FEATURE OUTPUT_VARIABLE ARM_FEATURE
RESULT_VARIABLE ARM_FEATURE_RESULT RESULT_VARIABLE ARM_FEATURE_RESULT
) )
if (ARM_FEATURE_RESULT) if (ARM_FEATURE_RESULT)
message(FATAL_ERROR "Failed to get ARM features") message(WARNING "Failed to get ARM features")
else() else()
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC) foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC)
string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos) string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
@ -317,6 +323,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS}) target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
if (GGML_BACKEND_DL) if (GGML_BACKEND_DL)
if (GGML_NATIVE)
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
endif()
# The feature detection code is compiled as a separate target so that # The feature detection code is compiled as a separate target so that
# it can be built without the architecture flags # it can be built without the architecture flags
# Since multiple variants of the CPU backend may be included in the same # Since multiple variants of the CPU backend may be included in the same

View File

@ -1657,7 +1657,7 @@ bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token t
} }
llama_token llama_token_bos_impl(const struct llama_vocab & vocab) { llama_token llama_token_bos_impl(const struct llama_vocab & vocab) {
return vocab.special_bos_id; return vocab.type != LLAMA_VOCAB_TYPE_WPM ? vocab.special_bos_id : vocab.special_cls_id;
} }
llama_token llama_token_eos_impl(const struct llama_vocab & vocab) { llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {

View File

@ -45,7 +45,7 @@ struct llama_vocab {
id special_unk_id = 0; id special_unk_id = 0;
id special_sep_id = LLAMA_TOKEN_NULL; id special_sep_id = LLAMA_TOKEN_NULL;
id special_pad_id = LLAMA_TOKEN_NULL; id special_pad_id = LLAMA_TOKEN_NULL;
id special_cls_id = LLAMA_TOKEN_NULL; id special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
id special_mask_id = LLAMA_TOKEN_NULL; id special_mask_id = LLAMA_TOKEN_NULL;
id linefeed_id = 13; id linefeed_id = 13;