mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-28 12:24:35 +00:00
ggml : add support for dynamic loading of backends
This commit is contained in:
parent
55ed008b2d
commit
d5a3beb0e0
@ -377,6 +377,9 @@ void common_init() {
|
||||
#endif
|
||||
|
||||
LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
|
||||
|
||||
// load dynamic backends
|
||||
ggml_backend_load_all();
|
||||
}
|
||||
|
||||
std::string common_params_get_system_info(const common_params & params) {
|
||||
|
@ -12,13 +12,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
if (EMSCRIPTEN)
|
||||
else()
|
||||
add_subdirectory(cvector-generator)
|
||||
add_subdirectory(batched-bench)
|
||||
add_subdirectory(batched)
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
add_subdirectory(embedding)
|
||||
add_subdirectory(eval-callback)
|
||||
add_subdirectory(export-lora)
|
||||
add_subdirectory(gbnf-validator)
|
||||
add_subdirectory(gguf-hash)
|
||||
add_subdirectory(gguf-split)
|
||||
@ -27,28 +24,34 @@ else()
|
||||
add_subdirectory(imatrix)
|
||||
add_subdirectory(infill)
|
||||
add_subdirectory(llama-bench)
|
||||
add_subdirectory(llava)
|
||||
add_subdirectory(lookahead)
|
||||
add_subdirectory(lookup)
|
||||
add_subdirectory(main)
|
||||
add_subdirectory(parallel)
|
||||
add_subdirectory(passkey)
|
||||
add_subdirectory(perplexity)
|
||||
add_subdirectory(quantize-stats)
|
||||
add_subdirectory(quantize)
|
||||
add_subdirectory(retrieval)
|
||||
if (GGML_RPC)
|
||||
add_subdirectory(rpc)
|
||||
endif()
|
||||
if (LLAMA_BUILD_SERVER)
|
||||
add_subdirectory(server)
|
||||
endif()
|
||||
if (GGML_SYCL)
|
||||
add_subdirectory(sycl)
|
||||
add_subdirectory(server)
|
||||
endif()
|
||||
add_subdirectory(save-load-state)
|
||||
add_subdirectory(simple)
|
||||
add_subdirectory(simple-chat)
|
||||
add_subdirectory(speculative)
|
||||
add_subdirectory(tokenize)
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
# these examples use the backends directly and cannot be built with dynamic loading
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
add_subdirectory(cvector-generator)
|
||||
add_subdirectory(export-lora)
|
||||
add_subdirectory(quantize-stats)
|
||||
add_subdirectory(llava)
|
||||
if (GGML_RPC)
|
||||
add_subdirectory(rpc)
|
||||
endif()
|
||||
if (GGML_SYCL)
|
||||
add_subdirectory(sycl)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
@ -1477,6 +1477,17 @@ int main(int argc, char ** argv) {
|
||||
|
||||
cmd_params params = parse_cmd_params(argc, argv);
|
||||
|
||||
// initialize backends
|
||||
ggml_backend_load_all();
|
||||
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
if (!cpu_dev) {
|
||||
fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
|
||||
auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
|
||||
auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free");
|
||||
|
||||
// initialize llama.cpp
|
||||
if (!params.verbose) {
|
||||
llama_log_set(llama_null_log_callback, NULL);
|
||||
@ -1551,7 +1562,7 @@ int main(int argc, char ** argv) {
|
||||
tpp.poll = t.poll;
|
||||
tpp.prio = params.prio;
|
||||
|
||||
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
|
||||
struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
|
||||
if (!threadpool) {
|
||||
fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
|
||||
exit(1);
|
||||
@ -1612,7 +1623,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_free(ctx);
|
||||
|
||||
ggml_threadpool_free(threadpool);
|
||||
ggml_threadpool_free_fn(threadpool);
|
||||
}
|
||||
|
||||
llama_free_model(lmodel);
|
||||
|
@ -165,6 +165,10 @@ int main(int argc, char ** argv) {
|
||||
|
||||
LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
|
||||
|
||||
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
|
||||
auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new");
|
||||
auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free");
|
||||
|
||||
struct ggml_threadpool_params tpp_batch =
|
||||
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
|
||||
struct ggml_threadpool_params tpp =
|
||||
@ -174,7 +178,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
struct ggml_threadpool * threadpool_batch = NULL;
|
||||
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
|
||||
threadpool_batch = ggml_threadpool_new(&tpp_batch);
|
||||
threadpool_batch = ggml_threadpool_new_fn(&tpp_batch);
|
||||
if (!threadpool_batch) {
|
||||
LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
|
||||
return 1;
|
||||
@ -184,7 +188,7 @@ int main(int argc, char ** argv) {
|
||||
tpp.paused = true;
|
||||
}
|
||||
|
||||
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
|
||||
struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
|
||||
if (!threadpool) {
|
||||
LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
|
||||
return 1;
|
||||
@ -890,8 +894,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
ggml_threadpool_free(threadpool);
|
||||
ggml_threadpool_free(threadpool_batch);
|
||||
ggml_threadpool_free_fn(threadpool);
|
||||
ggml_threadpool_free_fn(threadpool_batch);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -62,6 +62,9 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}, nullptr);
|
||||
|
||||
// load dynamic backends
|
||||
ggml_backend_load_all();
|
||||
|
||||
// initialize the model
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
model_params.n_gpu_layers = ngl;
|
||||
|
@ -74,6 +74,10 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
// load dynamic backends
|
||||
|
||||
ggml_backend_load_all();
|
||||
|
||||
// initialize the model
|
||||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
|
@ -33,6 +33,7 @@ else()
|
||||
endif()
|
||||
|
||||
option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
|
||||
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
|
||||
|
||||
#
|
||||
# option list
|
||||
|
@ -190,6 +190,14 @@ extern "C" {
|
||||
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
|
||||
// Get additional buffer types provided by the device (returns a NULL-terminated array)
|
||||
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
|
||||
// Set the abort callback for the backend
|
||||
typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
|
||||
// Get a list of feature flags supported by the backend (returns a NULL-terminated array)
|
||||
struct ggml_backend_feature {
|
||||
const char * name;
|
||||
const char * value;
|
||||
};
|
||||
typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);
|
||||
|
||||
//
|
||||
// Backend registry
|
||||
@ -214,6 +222,11 @@ extern "C" {
|
||||
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
|
||||
GGML_API ggml_backend_t ggml_backend_init_best(void);
|
||||
|
||||
// Load a backend from a dynamic library
|
||||
GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
|
||||
// Load all known backends from dynamic libraries
|
||||
GGML_API void ggml_backend_load_all(void);
|
||||
|
||||
//
|
||||
// Backend scheduler
|
||||
//
|
||||
|
@ -7,29 +7,6 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Scheduling priorities
|
||||
enum ggml_sched_priority {
|
||||
GGML_SCHED_PRIO_NORMAL,
|
||||
GGML_SCHED_PRIO_MEDIUM,
|
||||
GGML_SCHED_PRIO_HIGH,
|
||||
GGML_SCHED_PRIO_REALTIME
|
||||
};
|
||||
|
||||
// Threadpool params
|
||||
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
|
||||
struct ggml_threadpool_params {
|
||||
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
|
||||
int n_threads; // number of threads
|
||||
enum ggml_sched_priority prio; // thread priority
|
||||
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
|
||||
bool strict_cpu; // strict cpu placement
|
||||
bool paused; // start in paused state
|
||||
};
|
||||
|
||||
struct ggml_threadpool; // forward declaration, see ggml.c
|
||||
|
||||
typedef struct ggml_threadpool * ggml_threadpool_t;
|
||||
|
||||
// the compute plan that needs to be prepared for ggml_graph_compute()
|
||||
// since https://github.com/ggerganov/ggml/issues/287
|
||||
struct ggml_cplan {
|
||||
@ -75,14 +52,11 @@ extern "C" {
|
||||
GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||
GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
|
||||
|
||||
GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||
GGML_BACKEND_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
|
||||
GGML_BACKEND_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
|
||||
GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||
GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||
GGML_BACKEND_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
||||
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||
GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||
GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||
GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
|
||||
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||
|
||||
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
||||
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
||||
@ -104,10 +78,10 @@ extern "C" {
|
||||
GGML_BACKEND_API int ggml_cpu_has_sse3 (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_ssse3 (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx2 (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_f16c (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_fma (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx512 (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
|
||||
|
@ -2215,6 +2215,37 @@ extern "C" {
|
||||
|
||||
GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
|
||||
|
||||
// ggml threadpool
|
||||
// TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend
|
||||
// the goal should be to create an API that other backends can use move everything to the ggml base
|
||||
|
||||
// scheduling priorities
|
||||
enum ggml_sched_priority {
|
||||
GGML_SCHED_PRIO_NORMAL,
|
||||
GGML_SCHED_PRIO_MEDIUM,
|
||||
GGML_SCHED_PRIO_HIGH,
|
||||
GGML_SCHED_PRIO_REALTIME
|
||||
};
|
||||
|
||||
// threadpool params
|
||||
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
|
||||
struct ggml_threadpool_params {
|
||||
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
|
||||
int n_threads; // number of threads
|
||||
enum ggml_sched_priority prio; // thread priority
|
||||
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
|
||||
bool strict_cpu; // strict cpu placement
|
||||
bool paused; // start in paused state
|
||||
};
|
||||
|
||||
struct ggml_threadpool; // forward declaration, see ggml.c
|
||||
|
||||
typedef struct ggml_threadpool * ggml_threadpool_t;
|
||||
|
||||
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
|
||||
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@ -202,6 +202,10 @@ endif()
|
||||
|
||||
# ggml
|
||||
|
||||
if (GGML_BACKEND_DL)
|
||||
add_compile_definitions(GGML_BACKEND_DL)
|
||||
endif()
|
||||
|
||||
add_library(ggml-base
|
||||
../include/ggml.h
|
||||
../include/ggml-alloc.h
|
||||
@ -239,11 +243,18 @@ function(ggml_add_backend backend)
|
||||
if (${BUILD_SHARED_LIBS})
|
||||
target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_BUILD)
|
||||
target_compile_definitions(${backend_target} PUBLIC GGML_BACKEND_SHARED)
|
||||
if (GGML_BACKEND_DL)
|
||||
target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_DL)
|
||||
endif()
|
||||
endif()
|
||||
if (GGML_BACKEND_DL)
|
||||
install(TARGETS ${backend_target} RUNTIME)
|
||||
else()
|
||||
install(TARGETS ${backend_target} LIBRARY)
|
||||
target_link_libraries(ggml PUBLIC ${backend_target})
|
||||
string(TOUPPER "GGML_USE_${backend}" backend_use)
|
||||
target_compile_definitions(ggml PUBLIC ${backend_use})
|
||||
endif()
|
||||
install(TARGETS ${backend_target} LIBRARY)
|
||||
target_link_libraries(ggml PUBLIC ${backend_target})
|
||||
string(TOUPPER "GGML_USE_${backend}" backend_use)
|
||||
target_compile_definitions(ggml PUBLIC ${backend_use})
|
||||
endif()
|
||||
endif()
|
||||
endfunction()
|
||||
|
@ -444,3 +444,5 @@ ggml_backend_reg_t ggml_backend_amx_reg(void) {
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_amx_reg)
|
||||
|
@ -204,12 +204,30 @@ extern "C" {
|
||||
void * context;
|
||||
};
|
||||
|
||||
|
||||
// Internal backend registry API
|
||||
void ggml_backend_register(ggml_backend_reg_t reg);
|
||||
void ggml_backend_device_register(ggml_backend_dev_t device);
|
||||
// TODO: backends can be loaded as a dynamic library, in which case it needs to export this function
|
||||
// typedef ggml_backend_register_t * (*ggml_backend_init)(void);
|
||||
|
||||
// Add backend dynamic loading support to the backend
|
||||
#ifdef GGML_BACKEND_DL
|
||||
#ifdef __cplusplus
|
||||
# define GGML_BACKEND_DL_IMPL(reg_fn) \
|
||||
extern "C" { \
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(); \
|
||||
} \
|
||||
ggml_backend_reg_t ggml_backend_init() { \
|
||||
return reg_fn(); \
|
||||
}
|
||||
#else
|
||||
# define GGML_BACKEND_DL_IMPL(reg_fn) \
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(); \
|
||||
ggml_backend_reg_t ggml_backend_init() { \
|
||||
return reg_fn(); \
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
# define GGML_BACKEND_DL_IMPL(reg_fn)
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
@ -1,11 +1,13 @@
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-impl.h"
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
||||
// Backend registry
|
||||
#ifdef GGML_USE_CPU
|
||||
#include "ggml-cpu.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
#include "ggml-cuda.h"
|
||||
@ -75,8 +77,9 @@ struct ggml_backend_registry {
|
||||
#ifdef GGML_USE_KOMPUTE
|
||||
register_backend(ggml_backend_kompute_reg());
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CPU
|
||||
register_backend(ggml_backend_cpu_reg());
|
||||
#endif
|
||||
}
|
||||
|
||||
void register_backend(ggml_backend_reg_t reg) {
|
||||
@ -193,3 +196,86 @@ ggml_backend_t ggml_backend_init_best(void) {
|
||||
}
|
||||
return ggml_backend_dev_init(dev, NULL);
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
# ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
# endif
|
||||
# include <windows.h>
|
||||
#else
|
||||
# include <dlfcn.h>
|
||||
#endif
|
||||
|
||||
typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
|
||||
|
||||
ggml_backend_reg_t ggml_backend_load(const char * path) {
|
||||
#ifdef _WIN32
|
||||
HMODULE handle = LoadLibraryA(path);
|
||||
if (!handle) {
|
||||
GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError());
|
||||
return NULL;
|
||||
}
|
||||
ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init");
|
||||
if (!backend_init) {
|
||||
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError());
|
||||
FreeLibrary(handle);
|
||||
return NULL;
|
||||
}
|
||||
ggml_backend_reg_t reg = backend_init();
|
||||
if (!reg) {
|
||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s\n", __func__, path);
|
||||
FreeLibrary(handle);
|
||||
return NULL;
|
||||
}
|
||||
GGML_LOG_DEBUG("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
|
||||
ggml_backend_register(reg);
|
||||
return reg;
|
||||
#else
|
||||
void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
|
||||
if (!handle) {
|
||||
GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror());
|
||||
return NULL;
|
||||
}
|
||||
auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init");
|
||||
if (!backend_init) {
|
||||
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror());
|
||||
dlclose(handle);
|
||||
return NULL;
|
||||
}
|
||||
ggml_backend_reg_t reg = backend_init();
|
||||
if (!reg) {
|
||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s\n", __func__, path);
|
||||
dlclose(handle);
|
||||
return NULL;
|
||||
}
|
||||
GGML_LOG_DEBUG("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
|
||||
ggml_backend_register(reg);
|
||||
return reg;
|
||||
#endif
|
||||
}
|
||||
|
||||
void ggml_backend_load_all() {
|
||||
#ifdef _WIN32
|
||||
#define GGML_BACKEND_PATH(backend) "ggml-" backend ".dll"
|
||||
#elif defined(__APPLE__)
|
||||
// path is hardcoded to the cmake build directory for now
|
||||
// FIXME: should also search default system paths
|
||||
#define GGML_BACKEND_PATH(backend) "build/ggml/src/ggml-" backend "/libggml-" backend ".dylib"
|
||||
#else
|
||||
#define GGML_BACKEND_PATH(backend) "build/ggml/src/ggml-" backend "/libggml-" backend ".so"
|
||||
#endif
|
||||
|
||||
ggml_backend_load(GGML_BACKEND_PATH("amx"));
|
||||
ggml_backend_load(GGML_BACKEND_PATH("blas"));
|
||||
ggml_backend_load(GGML_BACKEND_PATH("cann"));
|
||||
ggml_backend_load(GGML_BACKEND_PATH("cuda"));
|
||||
ggml_backend_load(GGML_BACKEND_PATH("hip"));
|
||||
ggml_backend_load(GGML_BACKEND_PATH("kompute"));
|
||||
ggml_backend_load(GGML_BACKEND_PATH("metal"));
|
||||
ggml_backend_load(GGML_BACKEND_PATH("rpc"));
|
||||
ggml_backend_load(GGML_BACKEND_PATH("sycl"));
|
||||
ggml_backend_load(GGML_BACKEND_PATH("vulkan"));
|
||||
ggml_backend_load(GGML_BACKEND_PATH("musa"));
|
||||
ggml_backend_load(GGML_BACKEND_PATH("cpu"));
|
||||
}
|
||||
|
@ -512,3 +512,5 @@ ggml_backend_reg_t ggml_backend_blas_reg(void) {
|
||||
|
||||
return &ggml_backend_blas_reg;
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg)
|
||||
|
@ -2126,3 +2126,5 @@ void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
|
||||
ggml_cann_set_device(device);
|
||||
ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_cann_reg)
|
||||
|
@ -13578,29 +13578,6 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int
|
||||
|
||||
#endif // GGML_USE_OPENMP
|
||||
|
||||
void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
|
||||
p->n_threads = n_threads;
|
||||
p->prio = 0; // default priority (usually means normal or inherited)
|
||||
p->poll = 50; // hybrid-polling enabled
|
||||
p->strict_cpu = false; // no strict placement (all threads share same cpumask)
|
||||
p->paused = false; // threads are ready to go
|
||||
memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
|
||||
}
|
||||
|
||||
struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
|
||||
struct ggml_threadpool_params p;
|
||||
ggml_threadpool_params_init(&p, n_threads);
|
||||
return p;
|
||||
}
|
||||
|
||||
bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
|
||||
if (p0->n_threads != p1->n_threads ) return false;
|
||||
if (p0->prio != p1->prio ) return false;
|
||||
if (p0->poll != p1->poll ) return false;
|
||||
if (p0->strict_cpu != p1->strict_cpu ) return false;
|
||||
return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
|
||||
}
|
||||
|
||||
static struct ggml_threadpool * ggml_threadpool_new_impl(
|
||||
struct ggml_threadpool_params * tpp,
|
||||
struct ggml_cgraph * cgraph,
|
||||
|
@ -541,16 +541,12 @@ static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg
|
||||
return &ggml_backend_cpu_device;
|
||||
}
|
||||
|
||||
struct ggml_backend_feature {
|
||||
const char * name;
|
||||
const char * value;
|
||||
};
|
||||
|
||||
// Not used yet
|
||||
// This is intended to replace the the ggml_cpu_has_* functions when loading the CPU backend dynamically,
|
||||
// and additionally to allow other backends to expose their own list of features that applications can query using the same API.
|
||||
// and additionally to allow other backends to expose their own list of features that applications can query using the same API
|
||||
static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) {
|
||||
static std::vector<ggml_backend_feature> features = []() {
|
||||
ggml_cpu_init();
|
||||
|
||||
std::vector<ggml_backend_feature> features;
|
||||
if (ggml_cpu_has_sse3()) {
|
||||
features.push_back({ "SSE3", "1" });
|
||||
@ -561,6 +557,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
|
||||
if (ggml_cpu_has_avx()) {
|
||||
features.push_back({ "AVX", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_avx_vnni()) {
|
||||
features.push_back({ "AVX_VNNI", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_avx2()) {
|
||||
features.push_back({ "AVX2", "1" });
|
||||
}
|
||||
@ -570,9 +569,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
|
||||
if (ggml_cpu_has_fma()) {
|
||||
features.push_back({ "FMA", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_avx_vnni()) {
|
||||
features.push_back({ "AVX_VNNI", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_avx512()) {
|
||||
features.push_back({ "AVX512", "1" });
|
||||
}
|
||||
@ -619,6 +615,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
|
||||
if (ggml_cpu_has_llamafile()) {
|
||||
features.push_back({ "LLAMAFILE", "1" });
|
||||
}
|
||||
// TODO: rename this
|
||||
#ifdef GGML_USE_CPU_AARCH64
|
||||
features.push_back({ "AARCH64_REPACK", "1" });
|
||||
#endif
|
||||
|
||||
features.push_back({ nullptr, nullptr });
|
||||
|
||||
@ -637,6 +637,29 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
|
||||
if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
|
||||
return (void *)ggml_backend_cpu_get_extra_bufts;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_get_features") == 0) {
|
||||
return (void *)ggml_backend_cpu_get_features;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_set_abort_callback") == 0) {
|
||||
return (void *)ggml_backend_cpu_set_abort_callback;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_cpu_numa_init") == 0) {
|
||||
return (void *)ggml_numa_init;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) {
|
||||
return (void *)ggml_is_numa;
|
||||
}
|
||||
|
||||
// threadpool - TODO: move to ggml-base
|
||||
if (strcmp(name, "ggml_threadpool_new") == 0) {
|
||||
return (void *)ggml_threadpool_new;
|
||||
}
|
||||
if (strcmp(name, "ggml_threadpool_free") == 0) {
|
||||
return (void *)ggml_threadpool_free;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_cpu_set_threadpool") == 0) {
|
||||
return (void *)ggml_backend_cpu_set_threadpool;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
|
||||
@ -661,3 +684,5 @@ ggml_backend_reg_t ggml_backend_cpu_reg(void) {
|
||||
|
||||
return &ggml_backend_cpu_reg;
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_cpu_reg)
|
||||
|
@ -3126,6 +3126,61 @@ static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t re
|
||||
return ctx->devices[index];
|
||||
}
|
||||
|
||||
static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t reg) {
|
||||
static std::vector<ggml_backend_feature> features = []() {
|
||||
std::vector<ggml_backend_feature> features;
|
||||
#define _STRINGIFY(...) #__VA_ARGS__
|
||||
#define STRINGIFY(...) _STRINGIFY(__VA_ARGS__)
|
||||
|
||||
#ifdef __CUDA_ARCH_LIST__
|
||||
features.push_back({ "ARCHS", STRINGIFY(__CUDA_ARCH_LIST__) });
|
||||
#endif
|
||||
|
||||
#ifdef GGML_CUDA_FORCE_MMQ
|
||||
features.push_back({ "FORCE_MMQ", "1" });
|
||||
#endif
|
||||
|
||||
#ifdef GGML_CUDA_FORCE_CUBLAS
|
||||
features.push_back({ "FORCE_CUBLAS", "1" });
|
||||
#endif
|
||||
|
||||
#ifdef GGML_CUDA_NO_VMM
|
||||
features.push_back({ "NO_VMM", "1" });
|
||||
#endif
|
||||
|
||||
#ifdef GGML_CUDA_NO_PEER_COPY
|
||||
features.push_back({ "NO_PEER_COPY", "1" });
|
||||
#endif
|
||||
|
||||
#ifdef GGML_CUDA_F16
|
||||
features.push_back({ "F16", "1" });
|
||||
#endif
|
||||
|
||||
#ifdef GGML_CUDA_USE_GRAPHS
|
||||
features.push_back({ "USE_GRAPHS", "1" });
|
||||
#endif
|
||||
|
||||
#ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
|
||||
features.push_back({ "PEER_MAX_BATCH_SIZE", STRINGIFY(GGML_CUDA_PEER_MAX_BATCH_SIZE) });
|
||||
#endif
|
||||
|
||||
#ifdef GGML_CUDA_FA_ALL_QUANTS
|
||||
features.push_back({ "FA_ALL_QUANTS", "1" });
|
||||
#endif
|
||||
|
||||
#undef _STRINGIFY
|
||||
#undef STRINGIFY
|
||||
|
||||
features.push_back({ nullptr, nullptr });
|
||||
|
||||
return features;
|
||||
}();
|
||||
|
||||
return features.data();
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
|
||||
static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
||||
GGML_UNUSED(reg);
|
||||
if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
|
||||
@ -3137,6 +3192,9 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con
|
||||
if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) {
|
||||
return (void *)ggml_backend_cuda_unregister_host_buffer;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_get_features") == 0) {
|
||||
return (void *)ggml_backend_cuda_get_features;
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@ -3209,3 +3267,5 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
|
||||
|
||||
return cuda_backend;
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_cuda_reg)
|
||||
|
@ -2182,3 +2182,5 @@ ggml_backend_reg_t ggml_backend_kompute_reg() {
|
||||
|
||||
return ®
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_kompute_reg)
|
||||
|
@ -4396,3 +4396,5 @@ ggml_backend_reg_t ggml_backend_metal_reg(void) {
|
||||
|
||||
return &g_ggml_backend_metal_reg;
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_metal_reg)
|
||||
|
@ -1401,3 +1401,5 @@ ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) {
|
||||
|
||||
return dev;
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_rpc_reg)
|
||||
|
@ -4678,3 +4678,4 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
|
||||
return sycl_backend;
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_sycl_reg)
|
||||
|
@ -7365,3 +7365,5 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
|
||||
VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")");
|
||||
}
|
||||
#endif
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_vk_reg)
|
||||
|
@ -7571,3 +7571,26 @@ void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
|
||||
g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
|
||||
g_logger_state.log_callback_user_data = user_data;
|
||||
}
|
||||
|
||||
void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
|
||||
p->n_threads = n_threads;
|
||||
p->prio = 0; // default priority (usually means normal or inherited)
|
||||
p->poll = 50; // hybrid-polling enabled
|
||||
p->strict_cpu = false; // no strict placement (all threads share same cpumask)
|
||||
p->paused = false; // threads are ready to go
|
||||
memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
|
||||
}
|
||||
|
||||
struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
|
||||
struct ggml_threadpool_params p;
|
||||
ggml_threadpool_params_init(&p, n_threads);
|
||||
return p;
|
||||
}
|
||||
|
||||
bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
|
||||
if (p0->n_threads != p1->n_threads ) return false;
|
||||
if (p0->prio != p1->prio ) return false;
|
||||
if (p0->poll != p1->poll ) return false;
|
||||
if (p0->strict_cpu != p1->strict_cpu ) return false;
|
||||
return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
|
||||
}
|
||||
|
@ -8,5 +8,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
if (EMSCRIPTEN)
|
||||
else()
|
||||
add_subdirectory(vdot)
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
add_subdirectory(vdot)
|
||||
endif()
|
||||
endif()
|
||||
|
@ -4866,7 +4866,9 @@ struct llama_model_loader {
|
||||
mappings.reserve(files.size());
|
||||
mmaps_used.reserve(files.size());
|
||||
for (const auto & file : files) {
|
||||
std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
|
||||
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
|
||||
auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
|
||||
std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn()));
|
||||
mmaps_used.emplace_back(mapping->size, 0);
|
||||
if (mlock_mmaps) {
|
||||
std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
|
||||
@ -9190,7 +9192,7 @@ static bool llm_load_tensors(
|
||||
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
|
||||
if (!dev) {
|
||||
// FIXME: workaround for CPU backend buft having a NULL device
|
||||
dev = ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0);
|
||||
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
}
|
||||
ggml_backend_dev_props props;
|
||||
ggml_backend_dev_get_props(dev, &props);
|
||||
@ -17443,8 +17445,9 @@ static enum ggml_status llama_graph_compute(
|
||||
int n_threads,
|
||||
ggml_threadpool * threadpool) {
|
||||
if (lctx.backend_cpu != nullptr) {
|
||||
ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
|
||||
ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
|
||||
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu));
|
||||
auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
|
||||
set_threadpool_fn(lctx.backend_cpu, threadpool);
|
||||
}
|
||||
|
||||
// set the number of threads for all the backends
|
||||
@ -19478,7 +19481,11 @@ void llama_backend_init(void) {
|
||||
|
||||
void llama_numa_init(enum ggml_numa_strategy numa) {
|
||||
if (numa != GGML_NUMA_STRATEGY_DISABLED) {
|
||||
ggml_numa_init(numa);
|
||||
auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
GGML_ASSERT(dev && "CPU backend is not loaded");
|
||||
auto * reg = ggml_backend_dev_backend_reg(dev);
|
||||
auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
|
||||
numa_init_fn(numa);
|
||||
}
|
||||
}
|
||||
|
||||
@ -19752,9 +19759,6 @@ struct llama_context * llama_new_context_with_model(
|
||||
__func__, n_ctx_per_seq, hparams.n_ctx_train);
|
||||
}
|
||||
|
||||
ctx->abort_callback = params.abort_callback;
|
||||
ctx->abort_callback_data = params.abort_callback_data;
|
||||
|
||||
ctx->logits_all = params.logits_all;
|
||||
|
||||
// build worst-case graph for encoder if a model contains encoder
|
||||
@ -19803,7 +19807,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
}
|
||||
|
||||
// add CPU backend
|
||||
ctx->backend_cpu = ggml_backend_cpu_init();
|
||||
ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
|
||||
if (ctx->backend_cpu == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
|
||||
llama_free(ctx);
|
||||
@ -19823,6 +19827,8 @@ struct llama_context * llama_new_context_with_model(
|
||||
}
|
||||
}
|
||||
|
||||
llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data);
|
||||
|
||||
if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
|
||||
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
||||
llama_free(ctx);
|
||||
@ -19868,7 +19874,8 @@ struct llama_context * llama_new_context_with_model(
|
||||
std::vector<ggml_backend_t> backend_ptrs;
|
||||
for (auto & backend : ctx->backends) {
|
||||
auto * buft = ggml_backend_get_default_buffer_type(backend.get());
|
||||
if (ggml_backend_is_cpu(backend.get()) && !model->devices.empty()) {
|
||||
auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
|
||||
if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) {
|
||||
// use the host buffer of the first device CPU for faster transfer of the intermediate state
|
||||
auto * dev = model->devices[0];
|
||||
auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
|
||||
@ -19896,7 +19903,8 @@ struct llama_context * llama_new_context_with_model(
|
||||
// pipeline parallelism requires support for async compute and events in all devices
|
||||
if (pipeline_parallel) {
|
||||
for (auto & backend : ctx->backends) {
|
||||
if (ggml_backend_is_cpu(backend.get())) {
|
||||
auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
|
||||
if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
|
||||
// ignore CPU backend
|
||||
continue;
|
||||
}
|
||||
@ -21450,6 +21458,14 @@ int32_t llama_n_threads_batch(struct llama_context * ctx) {
|
||||
void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
|
||||
ctx->abort_callback = abort_callback;
|
||||
ctx->abort_callback_data = abort_callback_data;
|
||||
|
||||
for (auto & backend : ctx->backends) {
|
||||
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
|
||||
auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
|
||||
if (set_abort_callback_fn) {
|
||||
set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
|
||||
@ -22191,32 +22207,23 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
|
||||
}
|
||||
|
||||
const char * llama_print_system_info(void) {
|
||||
ggml_cpu_init(); // some ARM features are detected at runtime
|
||||
|
||||
static std::string s;
|
||||
|
||||
s = "";
|
||||
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
|
||||
s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | ";
|
||||
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
|
||||
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
|
||||
s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
|
||||
s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
|
||||
s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
|
||||
s += "AMX_INT8 = " + std::to_string(ggml_cpu_has_amx_int8()) + " | ";
|
||||
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
|
||||
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
|
||||
s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
|
||||
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
|
||||
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
|
||||
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|
||||
s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | ";
|
||||
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
||||
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
||||
s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
|
||||
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
||||
s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
|
||||
s += "LLAMAFILE = " + std::to_string(ggml_cpu_has_llamafile()) + " | ";
|
||||
for (int i = 0; i < ggml_backend_reg_count(); i++) {
|
||||
auto * reg = ggml_backend_reg_get(i);
|
||||
auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
|
||||
if (get_features_fn) {
|
||||
ggml_backend_feature * features = get_features_fn(reg);
|
||||
s += ggml_backend_reg_name(reg);
|
||||
s += " : ";
|
||||
for (; features->name; features++) {
|
||||
s += features->name;
|
||||
s += " = ";
|
||||
s += features->value;
|
||||
s += " | ";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return s.c_str();
|
||||
}
|
||||
|
@ -110,23 +110,26 @@ llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CU
|
||||
# llama_target_and_test(test-double-float.cpp) # SLOW
|
||||
llama_target_and_test(test-log.cpp)
|
||||
llama_target_and_test(test-arg-parser.cpp)
|
||||
llama_target_and_test(test-quantize-fns.cpp)
|
||||
llama_target_and_test(test-quantize-perf.cpp)
|
||||
llama_target_and_test(test-sampling.cpp)
|
||||
llama_target_and_test(test-chat-template.cpp)
|
||||
|
||||
llama_target_and_test(test-grammar-parser.cpp)
|
||||
llama_target_and_test(test-grammar-integration.cpp)
|
||||
llama_target_and_test(test-llama-grammar.cpp)
|
||||
llama_target_and_test(test-barrier.cpp)
|
||||
# llama_target_and_test(test-opt.cpp) # SLOW
|
||||
llama_target_and_test(test-backend-ops.cpp)
|
||||
|
||||
llama_target_and_test(test-rope.cpp)
|
||||
|
||||
llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
|
||||
llama_target_and_test(test-autorelease.cpp LABEL "model")
|
||||
|
||||
if (NOT GGML_BACKEND_DL)
|
||||
# these tests use the backends directly and cannot be built with dynamic loading
|
||||
llama_target_and_test(test-barrier.cpp)
|
||||
llama_target_and_test(test-quantize-fns.cpp)
|
||||
llama_target_and_test(test-quantize-perf.cpp)
|
||||
llama_target_and_test(test-rope.cpp)
|
||||
endif()
|
||||
|
||||
# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
|
||||
if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
|
||||
llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
|
||||
|
@ -16,7 +16,6 @@
|
||||
|
||||
|
||||
#include <ggml.h>
|
||||
#include <ggml-cpu.h>
|
||||
#include <ggml-alloc.h>
|
||||
#include <ggml-backend.h>
|
||||
|
||||
@ -26,7 +25,6 @@
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <cinttypes>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <random>
|
||||
#include <stdio.h>
|
||||
@ -639,19 +637,20 @@ struct test_case {
|
||||
|
||||
// determine number of runs
|
||||
int n_runs;
|
||||
bool is_cpu = ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_CPU;
|
||||
if (op_flops(out) > 0) {
|
||||
// based on flops
|
||||
const uint64_t GFLOP = 1000 * 1000 * 1000;
|
||||
const uint64_t target_flops_cpu = 8ULL * GFLOP;
|
||||
const uint64_t target_flops_gpu = 100ULL * GFLOP;
|
||||
uint64_t target_flops = ggml_backend_is_cpu(backend) ? target_flops_cpu : target_flops_gpu;
|
||||
uint64_t target_flops = is_cpu ? target_flops_cpu : target_flops_gpu;
|
||||
n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_flops / op_flops(out)) + 1;
|
||||
} else {
|
||||
// based on memory size
|
||||
const size_t GB = 1ULL << 30;
|
||||
const size_t target_size_cpu = 8 * GB;
|
||||
const size_t target_size_gpu = 32 * GB;
|
||||
size_t target_size = ggml_backend_is_cpu(backend) ? target_size_cpu : target_size_gpu;
|
||||
size_t target_size = is_cpu ? target_size_cpu : target_size_gpu;
|
||||
n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
|
||||
}
|
||||
|
||||
@ -3873,7 +3872,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||
static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
|
||||
if (mode == MODE_TEST) {
|
||||
auto test_cases = make_test_cases_eval();
|
||||
ggml_backend_t backend_cpu = ggml_backend_cpu_init();
|
||||
ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
|
||||
if (backend_cpu == NULL) {
|
||||
printf(" Failed to initialize CPU backend\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t n_ok = 0;
|
||||
for (auto & test : test_cases) {
|
||||
@ -3953,7 +3956,9 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
// enumerate backends
|
||||
// load and enumerate backends
|
||||
ggml_backend_load_all();
|
||||
|
||||
printf("Testing %zu devices\n\n", ggml_backend_dev_count());
|
||||
|
||||
size_t n_ok = 0;
|
||||
@ -3969,16 +3974,15 @@ int main(int argc, char ** argv) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
|
||||
GGML_ASSERT(backend != NULL);
|
||||
|
||||
if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) {
|
||||
if (backend_filter == NULL && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU && mode != MODE_GRAD) {
|
||||
printf(" Skipping CPU backend\n");
|
||||
ggml_backend_free(backend);
|
||||
n_ok++;
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
|
||||
GGML_ASSERT(backend != NULL);
|
||||
|
||||
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
|
||||
auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
|
||||
if (ggml_backend_set_n_threads_fn) {
|
||||
|
Loading…
Reference in New Issue
Block a user