ggml : add support for dynamic loading of backends

This commit is contained in:
slaren 2024-11-24 00:00:52 +01:00
parent 55ed008b2d
commit d5a3beb0e0
29 changed files with 422 additions and 144 deletions

View File

@ -377,6 +377,9 @@ void common_init() {
#endif
LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
// load dynamic backends
ggml_backend_load_all();
}
std::string common_params_get_system_info(const common_params & params) {

View File

@ -12,13 +12,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
if (EMSCRIPTEN)
else()
add_subdirectory(cvector-generator)
add_subdirectory(batched-bench)
add_subdirectory(batched)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)
add_subdirectory(eval-callback)
add_subdirectory(export-lora)
add_subdirectory(gbnf-validator)
add_subdirectory(gguf-hash)
add_subdirectory(gguf-split)
@ -27,28 +24,34 @@ else()
add_subdirectory(imatrix)
add_subdirectory(infill)
add_subdirectory(llama-bench)
add_subdirectory(llava)
add_subdirectory(lookahead)
add_subdirectory(lookup)
add_subdirectory(main)
add_subdirectory(parallel)
add_subdirectory(passkey)
add_subdirectory(perplexity)
add_subdirectory(quantize-stats)
add_subdirectory(quantize)
add_subdirectory(retrieval)
if (GGML_RPC)
add_subdirectory(rpc)
endif()
if (LLAMA_BUILD_SERVER)
add_subdirectory(server)
endif()
if (GGML_SYCL)
add_subdirectory(sycl)
add_subdirectory(server)
endif()
add_subdirectory(save-load-state)
add_subdirectory(simple)
add_subdirectory(simple-chat)
add_subdirectory(speculative)
add_subdirectory(tokenize)
if (NOT GGML_BACKEND_DL)
# these examples use the backends directly and cannot be built with dynamic loading
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(cvector-generator)
add_subdirectory(export-lora)
add_subdirectory(quantize-stats)
add_subdirectory(llava)
if (GGML_RPC)
add_subdirectory(rpc)
endif()
if (GGML_SYCL)
add_subdirectory(sycl)
endif()
endif()
endif()

View File

@ -1477,6 +1477,17 @@ int main(int argc, char ** argv) {
cmd_params params = parse_cmd_params(argc, argv);
// initialize backends
// with dynamic loading the backends (including the CPU backend) have to be loaded before they can be queried
ggml_backend_load_all();
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (!cpu_dev) {
    fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
    return 1;
}
// the threadpool functions are not linked directly: they are resolved by name through the CPU backend registry
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free");
// both pointers are called unconditionally further down, so fail early if either lookup returned NULL
if (!ggml_threadpool_new_fn || !ggml_threadpool_free_fn) {
    fprintf(stderr, "%s: error: CPU backend does not provide the threadpool functions\n", __func__);
    return 1;
}
// initialize llama.cpp
if (!params.verbose) {
llama_log_set(llama_null_log_callback, NULL);
@ -1551,7 +1562,7 @@ int main(int argc, char ** argv) {
tpp.poll = t.poll;
tpp.prio = params.prio;
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
if (!threadpool) {
fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
exit(1);
@ -1612,7 +1623,7 @@ int main(int argc, char ** argv) {
llama_free(ctx);
ggml_threadpool_free(threadpool);
ggml_threadpool_free_fn(threadpool);
}
llama_free_model(lmodel);

View File

@ -165,6 +165,10 @@ int main(int argc, char ** argv) {
LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new");
auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free");
struct ggml_threadpool_params tpp_batch =
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
struct ggml_threadpool_params tpp =
@ -174,7 +178,7 @@ int main(int argc, char ** argv) {
struct ggml_threadpool * threadpool_batch = NULL;
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
threadpool_batch = ggml_threadpool_new(&tpp_batch);
threadpool_batch = ggml_threadpool_new_fn(&tpp_batch);
if (!threadpool_batch) {
LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
return 1;
@ -184,7 +188,7 @@ int main(int argc, char ** argv) {
tpp.paused = true;
}
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
if (!threadpool) {
LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
return 1;
@ -890,8 +894,8 @@ int main(int argc, char ** argv) {
llama_backend_free();
ggml_threadpool_free(threadpool);
ggml_threadpool_free(threadpool_batch);
ggml_threadpool_free_fn(threadpool);
ggml_threadpool_free_fn(threadpool_batch);
return 0;
}

View File

@ -62,6 +62,9 @@ int main(int argc, char ** argv) {
}
}, nullptr);
// load dynamic backends
ggml_backend_load_all();
// initialize the model
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = ngl;

View File

@ -74,6 +74,10 @@ int main(int argc, char ** argv) {
}
}
// load dynamic backends
ggml_backend_load_all();
// initialize the model
llama_model_params model_params = llama_model_default_params();

View File

@ -33,6 +33,7 @@ else()
endif()
option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
#
# option list

View File

@ -190,6 +190,14 @@ extern "C" {
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
// Get additional buffer types provided by the device (returns a NULL-terminated array)
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
// Set the abort callback for the backend
typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
// A single backend feature flag, expressed as a name/value pair of C strings
// (for example, the CPU backend reports entries such as { "AVX", "1" })
struct ggml_backend_feature {
const char * name;  // feature name; NULL in the entry that terminates a feature list
const char * value; // feature value as a string
};
// Get a list of feature flags supported by the backend (returns a NULL-terminated array)
// The end of the array is marked by an entry whose name is NULL
// This function is obtained with ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features")
typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);
//
// Backend registry
@ -214,6 +222,11 @@ extern "C" {
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
GGML_API ggml_backend_t ggml_backend_init_best(void);
// Load a backend from a dynamic library
GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
// Load all known backends from dynamic libraries
GGML_API void ggml_backend_load_all(void);
//
// Backend scheduler
//

View File

@ -7,29 +7,6 @@
extern "C" {
#endif
// Scheduling priorities
enum ggml_sched_priority {
GGML_SCHED_PRIO_NORMAL,
GGML_SCHED_PRIO_MEDIUM,
GGML_SCHED_PRIO_HIGH,
GGML_SCHED_PRIO_REALTIME
};
// Threadpool params
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
struct ggml_threadpool_params {
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
int n_threads; // number of threads
enum ggml_sched_priority prio; // thread priority
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
bool strict_cpu; // strict cpu placement
bool paused; // start in paused state
};
struct ggml_threadpool; // forward declaration, see ggml.c
typedef struct ggml_threadpool * ggml_threadpool_t;
// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan {
@ -75,14 +52,11 @@ extern "C" {
GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_BACKEND_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
GGML_BACKEND_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_BACKEND_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
@ -104,10 +78,10 @@ extern "C" {
GGML_BACKEND_API int ggml_cpu_has_sse3 (void);
GGML_BACKEND_API int ggml_cpu_has_ssse3 (void);
GGML_BACKEND_API int ggml_cpu_has_avx (void);
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
GGML_BACKEND_API int ggml_cpu_has_avx2 (void);
GGML_BACKEND_API int ggml_cpu_has_f16c (void);
GGML_BACKEND_API int ggml_cpu_has_fma (void);
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
GGML_BACKEND_API int ggml_cpu_has_avx512 (void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);

View File

@ -2215,6 +2215,37 @@ extern "C" {
GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
// ggml threadpool
// TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend
// the goal should be to create an API that other backends can use, and to move everything to the ggml base
// scheduling priorities
// used as the type of ggml_threadpool_params::prio
// GGML_SCHED_PRIO_NORMAL is the first enumerator (value 0), which is the value
// ggml_threadpool_params_init() assigns as the default priority
enum ggml_sched_priority {
GGML_SCHED_PRIO_NORMAL,
GGML_SCHED_PRIO_MEDIUM,
GGML_SCHED_PRIO_HIGH,
GGML_SCHED_PRIO_REALTIME
};
// threadpool params
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
// Two parameter sets can be compared with ggml_threadpool_params_match(), which looks at every
// field except `paused`
struct ggml_threadpool_params {
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
int n_threads; // number of threads
enum ggml_sched_priority prio; // thread priority
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
bool strict_cpu; // strict cpu placement
bool paused; // start in paused state
};
struct ggml_threadpool; // forward declaration, see ggml.c
typedef struct ggml_threadpool * ggml_threadpool_t;
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
#ifdef __cplusplus
}
#endif

View File

@ -202,6 +202,10 @@ endif()
# ggml
if (GGML_BACKEND_DL)
add_compile_definitions(GGML_BACKEND_DL)
endif()
add_library(ggml-base
../include/ggml.h
../include/ggml-alloc.h
@ -239,11 +243,18 @@ function(ggml_add_backend backend)
if (${BUILD_SHARED_LIBS})
target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_BUILD)
target_compile_definitions(${backend_target} PUBLIC GGML_BACKEND_SHARED)
if (GGML_BACKEND_DL)
target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_DL)
endif()
endif()
if (GGML_BACKEND_DL)
install(TARGETS ${backend_target} RUNTIME)
else()
install(TARGETS ${backend_target} LIBRARY)
target_link_libraries(ggml PUBLIC ${backend_target})
string(TOUPPER "GGML_USE_${backend}" backend_use)
target_compile_definitions(ggml PUBLIC ${backend_use})
endif()
install(TARGETS ${backend_target} LIBRARY)
target_link_libraries(ggml PUBLIC ${backend_target})
string(TOUPPER "GGML_USE_${backend}" backend_use)
target_compile_definitions(ggml PUBLIC ${backend_use})
endif()
endif()
endfunction()

View File

@ -444,3 +444,5 @@ ggml_backend_reg_t ggml_backend_amx_reg(void) {
}
#endif
GGML_BACKEND_DL_IMPL(ggml_backend_amx_reg)

View File

@ -204,12 +204,30 @@ extern "C" {
void * context;
};
// Internal backend registry API
void ggml_backend_register(ggml_backend_reg_t reg);
void ggml_backend_device_register(ggml_backend_dev_t device);
// TODO: backends can be loaded as a dynamic library, in which case it needs to export this function
// typedef ggml_backend_register_t * (*ggml_backend_init)(void);
// Add backend dynamic loading support to the backend
//
// GGML_BACKEND_DL_IMPL(reg_fn) is placed once at the end of a backend's source file.
// When GGML_BACKEND_DL is defined it emits an exported, C-linkage entry point named
// ggml_backend_init that returns reg_fn(); ggml_backend_load() looks this symbol up by name
// after opening the backend library. Without GGML_BACKEND_DL the macro expands to nothing.
//
// The entry point is declared with an explicit (void) parameter list so that the C variant is a
// real prototype (in C an empty list `()` declares a function with unspecified parameters).
#ifdef GGML_BACKEND_DL
#ifdef __cplusplus
#    define GGML_BACKEND_DL_IMPL(reg_fn)                                 \
        extern "C" {                                                     \
            GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
        }                                                                \
        ggml_backend_reg_t ggml_backend_init(void) {                     \
            return reg_fn();                                             \
        }
#else
#    define GGML_BACKEND_DL_IMPL(reg_fn)                             \
        GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
        ggml_backend_reg_t ggml_backend_init(void) {                 \
            return reg_fn();                                         \
        }
#endif
#else
#    define GGML_BACKEND_DL_IMPL(reg_fn)
#endif
#ifdef __cplusplus
}

View File

@ -1,11 +1,13 @@
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"
#include "ggml-impl.h"
#include <cstring>
#include <vector>
// Backend registry
#ifdef GGML_USE_CPU
#include "ggml-cpu.h"
#endif
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
@ -75,8 +77,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_KOMPUTE
register_backend(ggml_backend_kompute_reg());
#endif
#ifdef GGML_USE_CPU
register_backend(ggml_backend_cpu_reg());
#endif
}
void register_backend(ggml_backend_reg_t reg) {
@ -193,3 +196,86 @@ ggml_backend_t ggml_backend_init_best(void) {
}
return ggml_backend_dev_init(dev, NULL);
}
#ifdef _WIN32
# define WIN32_LEAN_AND_MEAN
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <windows.h>
#else
# include <dlfcn.h>
#endif
typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
ggml_backend_reg_t ggml_backend_load(const char * path) {
#ifdef _WIN32
HMODULE handle = LoadLibraryA(path);
if (!handle) {
GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError());
return NULL;
}
ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init");
if (!backend_init) {
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError());
FreeLibrary(handle);
return NULL;
}
ggml_backend_reg_t reg = backend_init();
if (!reg) {
GGML_LOG_ERROR("%s: failed to initialize backend from %s\n", __func__, path);
FreeLibrary(handle);
return NULL;
}
GGML_LOG_DEBUG("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
ggml_backend_register(reg);
return reg;
#else
void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
if (!handle) {
GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror());
return NULL;
}
auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init");
if (!backend_init) {
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror());
dlclose(handle);
return NULL;
}
ggml_backend_reg_t reg = backend_init();
if (!reg) {
GGML_LOG_ERROR("%s: failed to initialize backend from %s\n", __func__, path);
dlclose(handle);
return NULL;
}
GGML_LOG_DEBUG("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
ggml_backend_register(reg);
return reg;
#endif
}
// Tries to load every known backend from its dynamic library
//
// Each path is handed to ggml_backend_load() in the order listed below; the return values are
// ignored, so a backend that cannot be loaded is skipped (ggml_backend_load() logs the failure).
void ggml_backend_load_all() {
#ifdef _WIN32
#define GGML_BACKEND_PATH(backend) "ggml-" backend ".dll"
#elif defined(__APPLE__)
// on non-Windows platforms the path is hardcoded to the cmake build directory for now
// FIXME: should also search default system paths
#define GGML_BACKEND_PATH(backend) "build/ggml/src/ggml-" backend "/libggml-" backend ".dylib"
#else
#define GGML_BACKEND_PATH(backend) "build/ggml/src/ggml-" backend "/libggml-" backend ".so"
#endif

    // the CPU backend is intentionally kept last, matching the original load order
    static const char * const backend_paths[] = {
        GGML_BACKEND_PATH("amx"),
        GGML_BACKEND_PATH("blas"),
        GGML_BACKEND_PATH("cann"),
        GGML_BACKEND_PATH("cuda"),
        GGML_BACKEND_PATH("hip"),
        GGML_BACKEND_PATH("kompute"),
        GGML_BACKEND_PATH("metal"),
        GGML_BACKEND_PATH("rpc"),
        GGML_BACKEND_PATH("sycl"),
        GGML_BACKEND_PATH("vulkan"),
        GGML_BACKEND_PATH("musa"),
        GGML_BACKEND_PATH("cpu"),
    };

    for (const char * path : backend_paths) {
        ggml_backend_load(path);
    }

// the helper macro is only meaningful inside this function, do not leak it to the rest of the file
#undef GGML_BACKEND_PATH
}

View File

@ -512,3 +512,5 @@ ggml_backend_reg_t ggml_backend_blas_reg(void) {
return &ggml_backend_blas_reg;
}
GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg)

View File

@ -2126,3 +2126,5 @@ void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
ggml_cann_set_device(device);
ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
}
GGML_BACKEND_DL_IMPL(ggml_backend_cann_reg)

View File

@ -13578,29 +13578,6 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int
#endif // GGML_USE_OPENMP
void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
p->n_threads = n_threads;
p->prio = 0; // default priority (usually means normal or inherited)
p->poll = 50; // hybrid-polling enabled
p->strict_cpu = false; // no strict placement (all threads share same cpumask)
p->paused = false; // threads are ready to go
memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
}
struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
struct ggml_threadpool_params p;
ggml_threadpool_params_init(&p, n_threads);
return p;
}
bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
if (p0->n_threads != p1->n_threads ) return false;
if (p0->prio != p1->prio ) return false;
if (p0->poll != p1->poll ) return false;
if (p0->strict_cpu != p1->strict_cpu ) return false;
return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
}
static struct ggml_threadpool * ggml_threadpool_new_impl(
struct ggml_threadpool_params * tpp,
struct ggml_cgraph * cgraph,

View File

@ -541,16 +541,12 @@ static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg
return &ggml_backend_cpu_device;
}
struct ggml_backend_feature {
const char * name;
const char * value;
};
// Not used yet
// This is intended to replace the ggml_cpu_has_* functions when loading the CPU backend dynamically,
// and additionally to allow other backends to expose their own list of features that applications can query using the same API.
// and additionally to allow other backends to expose their own list of features that applications can query using the same API
static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) {
static std::vector<ggml_backend_feature> features = []() {
ggml_cpu_init();
std::vector<ggml_backend_feature> features;
if (ggml_cpu_has_sse3()) {
features.push_back({ "SSE3", "1" });
@ -561,6 +557,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
if (ggml_cpu_has_avx()) {
features.push_back({ "AVX", "1" });
}
if (ggml_cpu_has_avx_vnni()) {
features.push_back({ "AVX_VNNI", "1" });
}
if (ggml_cpu_has_avx2()) {
features.push_back({ "AVX2", "1" });
}
@ -570,9 +569,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
if (ggml_cpu_has_fma()) {
features.push_back({ "FMA", "1" });
}
if (ggml_cpu_has_avx_vnni()) {
features.push_back({ "AVX_VNNI", "1" });
}
if (ggml_cpu_has_avx512()) {
features.push_back({ "AVX512", "1" });
}
@ -619,6 +615,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
if (ggml_cpu_has_llamafile()) {
features.push_back({ "LLAMAFILE", "1" });
}
// TODO: rename this
#ifdef GGML_USE_CPU_AARCH64
features.push_back({ "AARCH64_REPACK", "1" });
#endif
features.push_back({ nullptr, nullptr });
@ -637,6 +637,29 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
return (void *)ggml_backend_cpu_get_extra_bufts;
}
if (strcmp(name, "ggml_backend_get_features") == 0) {
return (void *)ggml_backend_cpu_get_features;
}
if (strcmp(name, "ggml_backend_set_abort_callback") == 0) {
return (void *)ggml_backend_cpu_set_abort_callback;
}
if (strcmp(name, "ggml_backend_cpu_numa_init") == 0) {
return (void *)ggml_numa_init;
}
if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) {
return (void *)ggml_is_numa;
}
// threadpool - TODO: move to ggml-base
if (strcmp(name, "ggml_threadpool_new") == 0) {
return (void *)ggml_threadpool_new;
}
if (strcmp(name, "ggml_threadpool_free") == 0) {
return (void *)ggml_threadpool_free;
}
if (strcmp(name, "ggml_backend_cpu_set_threadpool") == 0) {
return (void *)ggml_backend_cpu_set_threadpool;
}
return NULL;
@ -661,3 +684,5 @@ ggml_backend_reg_t ggml_backend_cpu_reg(void) {
return &ggml_backend_cpu_reg;
}
GGML_BACKEND_DL_IMPL(ggml_backend_cpu_reg)

View File

@ -3126,6 +3126,61 @@ static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t re
return ctx->devices[index];
}
// Returns the build-time feature flags of the CUDA backend as an array of name/value pairs
//
// The list is assembled from preprocessor definitions the first time the function is called and
// is terminated by a { nullptr, nullptr } entry. `reg` is not used.
static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t reg) {
    static std::vector<ggml_backend_feature> features = []() {
        std::vector<ggml_backend_feature> list;

    // two-level stringification so that macro arguments are expanded before being turned into a string
    // the helpers carry a GGML_CUDA_ prefix: names starting with an underscore followed by an uppercase
    // letter are reserved, and an unprefixed STRINGIFY could collide with (and then #undef) an existing macro
    #define GGML_CUDA_STRINGIFY_IMPL(...) #__VA_ARGS__
    #define GGML_CUDA_STRINGIFY(...) GGML_CUDA_STRINGIFY_IMPL(__VA_ARGS__)

    #ifdef __CUDA_ARCH_LIST__
        list.push_back({ "ARCHS", GGML_CUDA_STRINGIFY(__CUDA_ARCH_LIST__) });
    #endif

    #ifdef GGML_CUDA_FORCE_MMQ
        list.push_back({ "FORCE_MMQ", "1" });
    #endif

    #ifdef GGML_CUDA_FORCE_CUBLAS
        list.push_back({ "FORCE_CUBLAS", "1" });
    #endif

    #ifdef GGML_CUDA_NO_VMM
        list.push_back({ "NO_VMM", "1" });
    #endif

    #ifdef GGML_CUDA_NO_PEER_COPY
        list.push_back({ "NO_PEER_COPY", "1" });
    #endif

    #ifdef GGML_CUDA_F16
        list.push_back({ "F16", "1" });
    #endif

    #ifdef GGML_CUDA_USE_GRAPHS
        list.push_back({ "USE_GRAPHS", "1" });
    #endif

    #ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
        list.push_back({ "PEER_MAX_BATCH_SIZE", GGML_CUDA_STRINGIFY(GGML_CUDA_PEER_MAX_BATCH_SIZE) });
    #endif

    #ifdef GGML_CUDA_FA_ALL_QUANTS
        list.push_back({ "FA_ALL_QUANTS", "1" });
    #endif

    #undef GGML_CUDA_STRINGIFY_IMPL
    #undef GGML_CUDA_STRINGIFY

        // terminating entry: callers iterate until they find a NULL name
        list.push_back({ nullptr, nullptr });

        return list;
    }();

    return features.data();

    GGML_UNUSED(reg);
}
static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
GGML_UNUSED(reg);
if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
@ -3137,6 +3192,9 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con
if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) {
return (void *)ggml_backend_cuda_unregister_host_buffer;
}
if (strcmp(name, "ggml_backend_get_features") == 0) {
return (void *)ggml_backend_cuda_get_features;
}
return nullptr;
}
@ -3209,3 +3267,5 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
return cuda_backend;
}
GGML_BACKEND_DL_IMPL(ggml_backend_cuda_reg)

View File

@ -2182,3 +2182,5 @@ ggml_backend_reg_t ggml_backend_kompute_reg() {
return &reg;
}
GGML_BACKEND_DL_IMPL(ggml_backend_kompute_reg)

View File

@ -4396,3 +4396,5 @@ ggml_backend_reg_t ggml_backend_metal_reg(void) {
return &g_ggml_backend_metal_reg;
}
GGML_BACKEND_DL_IMPL(ggml_backend_metal_reg)

View File

@ -1401,3 +1401,5 @@ ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) {
return dev;
}
GGML_BACKEND_DL_IMPL(ggml_backend_rpc_reg)

View File

@ -4678,3 +4678,4 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
return sycl_backend;
}
GGML_BACKEND_DL_IMPL(ggml_backend_sycl_reg)

View File

@ -7365,3 +7365,5 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")");
}
#endif
GGML_BACKEND_DL_IMPL(ggml_backend_vk_reg)

View File

@ -7571,3 +7571,26 @@ void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
g_logger_state.log_callback_user_data = user_data;
}
void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
p->n_threads = n_threads;
p->prio = 0; // default priority (usually means normal or inherited)
p->poll = 50; // hybrid-polling enabled
p->strict_cpu = false; // no strict placement (all threads share same cpumask)
p->paused = false; // threads are ready to go
memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
}
struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
struct ggml_threadpool_params p;
ggml_threadpool_params_init(&p, n_threads);
return p;
}
// Returns true when two parameter sets agree on n_threads, prio, poll, strict_cpu and the cpumask
// The `paused` field is not part of the comparison
bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
    if (p0->n_threads  != p1->n_threads ) return false;
    if (p0->prio       != p1->prio      ) return false;
    if (p0->poll       != p1->poll      ) return false;
    if (p0->strict_cpu != p1->strict_cpu) return false;
    // compare the whole array by its size in bytes rather than by its element count
    return memcmp(p0->cpumask, p1->cpumask, sizeof(p0->cpumask)) == 0;
}

View File

@ -8,5 +8,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
if (EMSCRIPTEN)
else()
add_subdirectory(vdot)
if (NOT GGML_BACKEND_DL)
add_subdirectory(vdot)
endif()
endif()

View File

@ -4866,7 +4866,9 @@ struct llama_model_loader {
mappings.reserve(files.size());
mmaps_used.reserve(files.size());
for (const auto & file : files) {
std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn()));
mmaps_used.emplace_back(mapping->size, 0);
if (mlock_mmaps) {
std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
@ -9190,7 +9192,7 @@ static bool llm_load_tensors(
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
if (!dev) {
// FIXME: workaround for CPU backend buft having a NULL device
dev = ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0);
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
}
ggml_backend_dev_props props;
ggml_backend_dev_get_props(dev, &props);
@ -17443,8 +17445,9 @@ static enum ggml_status llama_graph_compute(
int n_threads,
ggml_threadpool * threadpool) {
if (lctx.backend_cpu != nullptr) {
ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu));
auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
set_threadpool_fn(lctx.backend_cpu, threadpool);
}
// set the number of threads for all the backends
@ -19478,7 +19481,11 @@ void llama_backend_init(void) {
void llama_numa_init(enum ggml_numa_strategy numa) {
if (numa != GGML_NUMA_STRATEGY_DISABLED) {
ggml_numa_init(numa);
// NUMA initialization is implemented by the CPU backend and resolved by name through its registry
auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
GGML_ASSERT(dev && "CPU backend is not loaded");
auto * reg = ggml_backend_dev_backend_reg(dev);
auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
// the lookup returns NULL for unknown names: fail with a clear message instead of calling a NULL pointer
GGML_ASSERT(numa_init_fn && "CPU backend does not provide ggml_backend_cpu_numa_init");
numa_init_fn(numa);
}
}
@ -19752,9 +19759,6 @@ struct llama_context * llama_new_context_with_model(
__func__, n_ctx_per_seq, hparams.n_ctx_train);
}
ctx->abort_callback = params.abort_callback;
ctx->abort_callback_data = params.abort_callback_data;
ctx->logits_all = params.logits_all;
// build worst-case graph for encoder if a model contains encoder
@ -19803,7 +19807,7 @@ struct llama_context * llama_new_context_with_model(
}
// add CPU backend
ctx->backend_cpu = ggml_backend_cpu_init();
ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
if (ctx->backend_cpu == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
llama_free(ctx);
@ -19823,6 +19827,8 @@ struct llama_context * llama_new_context_with_model(
}
}
llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data);
if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
llama_free(ctx);
@ -19868,7 +19874,8 @@ struct llama_context * llama_new_context_with_model(
std::vector<ggml_backend_t> backend_ptrs;
for (auto & backend : ctx->backends) {
auto * buft = ggml_backend_get_default_buffer_type(backend.get());
if (ggml_backend_is_cpu(backend.get()) && !model->devices.empty()) {
auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) {
// use the host buffer of the first device CPU for faster transfer of the intermediate state
auto * dev = model->devices[0];
auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
@ -19896,7 +19903,8 @@ struct llama_context * llama_new_context_with_model(
// pipeline parallelism requires support for async compute and events in all devices
if (pipeline_parallel) {
for (auto & backend : ctx->backends) {
if (ggml_backend_is_cpu(backend.get())) {
auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
// ignore CPU backend
continue;
}
@ -21450,6 +21458,14 @@ int32_t llama_n_threads_batch(struct llama_context * ctx) {
// Stores the abort callback (and its user data) in the context and forwards it to the backends
// Only backends whose registry exposes "ggml_backend_set_abort_callback" receive it; the others are skipped
void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
    ctx->abort_callback      = abort_callback;
    ctx->abort_callback_data = abort_callback_data;

    for (auto & backend : ctx->backends) {
        auto * be  = backend.get();
        auto * dev = ggml_backend_get_device(be);
        auto * reg = ggml_backend_dev_backend_reg(dev);

        // the setter is optional and looked up by name
        auto * setter = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
        if (!setter) {
            continue;
        }

        setter(be, ctx->abort_callback, ctx->abort_callback_data);
    }
}
void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
@ -22191,32 +22207,23 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
}
const char * llama_print_system_info(void) {
ggml_cpu_init(); // some ARM features are detected at runtime
static std::string s;
s = "";
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | ";
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
s += "AMX_INT8 = " + std::to_string(ggml_cpu_has_amx_int8()) + " | ";
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | ";
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
s += "LLAMAFILE = " + std::to_string(ggml_cpu_has_llamafile()) + " | ";
// append "<backend name> : <feature> = <value> | ..." for every registered backend that exposes a feature list
// the index is a size_t rather than an int (assumes ggml_backend_reg_count() returns size_t - confirm in ggml-backend.h)
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
    auto * reg = ggml_backend_reg_get(i);
    // the feature query is optional and looked up by name
    auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
    if (get_features_fn) {
        ggml_backend_feature * features = get_features_fn(reg);
        s += ggml_backend_reg_name(reg);
        s += " : ";
        // the feature array is terminated by an entry with a NULL name
        for (; features->name; features++) {
            s += features->name;
            s += " = ";
            s += features->value;
            s += " | ";
        }
    }
}
return s.c_str();
}

View File

@ -110,23 +110,26 @@ llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CU
# llama_target_and_test(test-double-float.cpp) # SLOW
llama_target_and_test(test-log.cpp)
llama_target_and_test(test-arg-parser.cpp)
llama_target_and_test(test-quantize-fns.cpp)
llama_target_and_test(test-quantize-perf.cpp)
llama_target_and_test(test-sampling.cpp)
llama_target_and_test(test-chat-template.cpp)
llama_target_and_test(test-grammar-parser.cpp)
llama_target_and_test(test-grammar-integration.cpp)
llama_target_and_test(test-llama-grammar.cpp)
llama_target_and_test(test-barrier.cpp)
# llama_target_and_test(test-opt.cpp) # SLOW
llama_target_and_test(test-backend-ops.cpp)
llama_target_and_test(test-rope.cpp)
llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
llama_target_and_test(test-autorelease.cpp LABEL "model")
# Only register the tests below when GGML_BACKEND_DL is not set.
# NOTE(review): test-barrier, test-quantize-fns, test-quantize-perf and test-rope
# also appear unguarded earlier in this listing; if both copies are live the
# targets would be defined twice - confirm the earlier lines were removed.
if (NOT GGML_BACKEND_DL)
# these tests use the backends directly and cannot be built with dynamic loading
llama_target_and_test(test-barrier.cpp)
llama_target_and_test(test-quantize-fns.cpp)
llama_target_and_test(test-quantize-perf.cpp)
llama_target_and_test(test-rope.cpp)
endif()
# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)

View File

@ -16,7 +16,6 @@
#include <ggml.h>
#include <ggml-cpu.h>
#include <ggml-alloc.h>
#include <ggml-backend.h>
@ -26,7 +25,6 @@
#include <cstdint>
#include <cstring>
#include <cinttypes>
#include <functional>
#include <memory>
#include <random>
#include <stdio.h>
@ -639,19 +637,20 @@ struct test_case {
// determine number of runs
int n_runs;
bool is_cpu = ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_CPU;
if (op_flops(out) > 0) {
// based on flops
const uint64_t GFLOP = 1000 * 1000 * 1000;
const uint64_t target_flops_cpu = 8ULL * GFLOP;
const uint64_t target_flops_gpu = 100ULL * GFLOP;
uint64_t target_flops = ggml_backend_is_cpu(backend) ? target_flops_cpu : target_flops_gpu;
uint64_t target_flops = is_cpu ? target_flops_cpu : target_flops_gpu;
n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_flops / op_flops(out)) + 1;
} else {
// based on memory size
const size_t GB = 1ULL << 30;
const size_t target_size_cpu = 8 * GB;
const size_t target_size_gpu = 32 * GB;
size_t target_size = ggml_backend_is_cpu(backend) ? target_size_cpu : target_size_gpu;
size_t target_size = is_cpu ? target_size_cpu : target_size_gpu;
n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
}
@ -3873,7 +3872,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
if (mode == MODE_TEST) {
auto test_cases = make_test_cases_eval();
ggml_backend_t backend_cpu = ggml_backend_cpu_init();
ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
if (backend_cpu == NULL) {
printf(" Failed to initialize CPU backend\n");
return false;
}
size_t n_ok = 0;
for (auto & test : test_cases) {
@ -3953,7 +3956,9 @@ int main(int argc, char ** argv) {
}
}
// enumerate backends
// load and enumerate backends
ggml_backend_load_all();
printf("Testing %zu devices\n\n", ggml_backend_dev_count());
size_t n_ok = 0;
@ -3969,16 +3974,15 @@ int main(int argc, char ** argv) {
continue;
}
ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
GGML_ASSERT(backend != NULL);
if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) {
if (backend_filter == NULL && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU && mode != MODE_GRAD) {
printf(" Skipping CPU backend\n");
ggml_backend_free(backend);
n_ok++;
continue;
}
ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
GGML_ASSERT(backend != NULL);
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
if (ggml_backend_set_n_threads_fn) {