ggml : add support for dynamic loading of backends
commit d5a3beb0e0 (parent 55ed008b2d)
@@ -377,6 +377,9 @@ void common_init() {
 #endif

     LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
+
+    // load dynamic backends
+    ggml_backend_load_all();
 }

 std::string common_params_get_system_info(const common_params & params) {
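Under `GGML_BACKEND_DL` nothing is registered at link time, so applications call `ggml_backend_load_all()` once at startup before touching any device. A minimal sketch of the host-side flow, using only the registry API touched by this commit:

```cpp
// minimal sketch: host application startup under dynamic backend loading
#include "ggml-backend.h"
#include <cstdio>

int main() {
    // populate the backend registry from whatever backend shared libraries are found
    ggml_backend_load_all();

    // all devices now come from the registered (possibly dynamically loaded) backends
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        printf("device %zu: %s\n", i, ggml_backend_dev_name(dev));
    }
    return 0;
}
```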
@@ -12,13 +12,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})

 if (EMSCRIPTEN)
 else()
-    add_subdirectory(cvector-generator)
     add_subdirectory(batched-bench)
     add_subdirectory(batched)
-    add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)
-    add_subdirectory(export-lora)
     add_subdirectory(gbnf-validator)
     add_subdirectory(gguf-hash)
     add_subdirectory(gguf-split)
@@ -27,28 +24,34 @@ else()
     add_subdirectory(imatrix)
     add_subdirectory(infill)
     add_subdirectory(llama-bench)
-    add_subdirectory(llava)
     add_subdirectory(lookahead)
     add_subdirectory(lookup)
     add_subdirectory(main)
     add_subdirectory(parallel)
     add_subdirectory(passkey)
     add_subdirectory(perplexity)
-    add_subdirectory(quantize-stats)
     add_subdirectory(quantize)
     add_subdirectory(retrieval)
-    if (GGML_RPC)
-        add_subdirectory(rpc)
-    endif()
     if (LLAMA_BUILD_SERVER)
         add_subdirectory(server)
-    endif()
-    if (GGML_SYCL)
-        add_subdirectory(sycl)
     endif()
     add_subdirectory(save-load-state)
     add_subdirectory(simple)
     add_subdirectory(simple-chat)
     add_subdirectory(speculative)
     add_subdirectory(tokenize)
+    if (NOT GGML_BACKEND_DL)
+        # these examples use the backends directly and cannot be built with dynamic loading
+        add_subdirectory(convert-llama2c-to-ggml)
+        add_subdirectory(cvector-generator)
+        add_subdirectory(export-lora)
+        add_subdirectory(quantize-stats)
+        add_subdirectory(llava)
+        if (GGML_RPC)
+            add_subdirectory(rpc)
+        endif()
+        if (GGML_SYCL)
+            add_subdirectory(sycl)
+        endif()
+    endif()
 endif()
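The examples moved under `if (NOT GGML_BACKEND_DL)` call backend-specific entry points directly, so they need the backend as a link-time dependency, which no longer exists when backends are built as dynamic libraries. For contrast, a hedged sketch of registry-based initialization that works either way (the fallback chain here is illustrative, not taken from this commit):

```cpp
#include "ggml-backend.h"

// illustrative sketch: device-type based initialization needs no link-time
// dependency on any particular backend
ggml_backend_t make_backend(void) {
    ggml_backend_t backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, /*params =*/ NULL);
    if (backend == NULL) {
        // fall back to the CPU backend if no GPU backend was registered/loaded
        backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
    }
    return backend;
}
```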
@@ -1477,6 +1477,17 @@ int main(int argc, char ** argv) {
     cmd_params params = parse_cmd_params(argc, argv);

+    // initialize backends
+    ggml_backend_load_all();
+    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (!cpu_dev) {
+        fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
+        return 1;
+    }
+    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+    auto * ggml_threadpool_new_fn  = (decltype(ggml_threadpool_new)  *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
+    auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free");
+
     // initialize llama.cpp
     if (!params.verbose) {
         llama_log_set(llama_null_log_callback, NULL);
@@ -1551,7 +1562,7 @@ int main(int argc, char ** argv) {
     tpp.poll = t.poll;
     tpp.prio = params.prio;

-    struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+    struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
     if (!threadpool) {
         fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
         exit(1);
@@ -1612,7 +1623,7 @@ int main(int argc, char ** argv) {

         llama_free(ctx);

-        ggml_threadpool_free(threadpool);
+        ggml_threadpool_free_fn(threadpool);
     }

     llama_free_model(lmodel);
@@ -165,6 +165,10 @@ int main(int argc, char ** argv) {

     LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);

+    auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
+    auto * ggml_threadpool_new_fn  = (decltype(ggml_threadpool_new)  *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new");
+    auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free");
+
     struct ggml_threadpool_params tpp_batch =
             ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
     struct ggml_threadpool_params tpp =
@@ -174,7 +178,7 @@ int main(int argc, char ** argv) {

     struct ggml_threadpool * threadpool_batch = NULL;
     if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
-        threadpool_batch = ggml_threadpool_new(&tpp_batch);
+        threadpool_batch = ggml_threadpool_new_fn(&tpp_batch);
         if (!threadpool_batch) {
             LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
             return 1;
@@ -184,7 +188,7 @@ int main(int argc, char ** argv) {
         tpp.paused = true;
     }

-    struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+    struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
     if (!threadpool) {
         LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
         return 1;
@@ -890,8 +894,8 @@ int main(int argc, char ** argv) {

     llama_backend_free();

-    ggml_threadpool_free(threadpool);
-    ggml_threadpool_free(threadpool_batch);
+    ggml_threadpool_free_fn(threadpool);
+    ggml_threadpool_free_fn(threadpool_batch);

     return 0;
 }
@@ -62,6 +62,9 @@ int main(int argc, char ** argv) {
         }
     }, nullptr);

+    // load dynamic backends
+    ggml_backend_load_all();
+
     // initialize the model
     llama_model_params model_params = llama_model_default_params();
     model_params.n_gpu_layers = ngl;
@@ -74,6 +74,10 @@ int main(int argc, char ** argv) {
         }
     }

+    // load dynamic backends
+
+    ggml_backend_load_all();
+
     // initialize the model

     llama_model_params model_params = llama_model_default_params();
@@ -33,6 +33,7 @@ else()
 endif()

 option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
+option(GGML_BACKEND_DL   "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)

 #
 # option list
@@ -190,6 +190,14 @@ extern "C" {
     typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
     // Get additional buffer types provided by the device (returns a NULL-terminated array)
     typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
+    // Set the abort callback for the backend
+    typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
+    // Get a list of feature flags supported by the backend (returns a NULL-terminated array)
+    struct ggml_backend_feature {
+        const char * name;
+        const char * value;
+    };
+    typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);

     //
     // Backend registry
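The feature list is reached through `ggml_backend_reg_get_proc_address` rather than a new registry vtable entry, so backends that do not implement it simply return NULL for the lookup. A small sketch of the consumer side, assuming only the declarations above:

```cpp
#include "ggml-backend.h"
#include <cstdio>

// sketch: print the feature flags of one backend, if it reports any
void print_backend_features(ggml_backend_reg_t reg) {
    auto * get_features_fn =
        (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
    if (!get_features_fn) {
        return; // feature reporting is optional
    }
    // the array is NULL-terminated: iterate until name == NULL
    for (struct ggml_backend_feature * f = get_features_fn(reg); f->name != nullptr; f++) {
        printf("%s = %s\n", f->name, f->value);
    }
}
```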
@@ -214,6 +222,11 @@ extern "C" {
     // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
     GGML_API ggml_backend_t ggml_backend_init_best(void);

+    // Load a backend from a dynamic library
+    GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
+    // Load all known backends from dynamic libraries
+    GGML_API void ggml_backend_load_all(void);
+
     //
     // Backend scheduler
     //
@@ -7,29 +7,6 @@
 extern "C" {
 #endif

-    // Scheduling priorities
-    enum ggml_sched_priority {
-        GGML_SCHED_PRIO_NORMAL,
-        GGML_SCHED_PRIO_MEDIUM,
-        GGML_SCHED_PRIO_HIGH,
-        GGML_SCHED_PRIO_REALTIME
-    };
-
-    // Threadpool params
-    // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
-    struct ggml_threadpool_params {
-        bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
-        int  n_threads;                   // number of threads
-        enum ggml_sched_priority prio;    // thread priority
-        uint32_t poll;                    // polling level (0 - no polling, 100 - aggressive polling)
-        bool strict_cpu;                  // strict cpu placement
-        bool paused;                      // start in paused state
-    };
-
-    struct ggml_threadpool;     // forward declaration, see ggml.c
-
-    typedef struct ggml_threadpool * ggml_threadpool_t;
-
     // the compute plan that needs to be prepared for ggml_graph_compute()
     // since https://github.com/ggerganov/ggml/issues/287
     struct ggml_cplan {
@@ -75,14 +52,11 @@ extern "C" {
     GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
     GGML_BACKEND_API void  ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);

-    GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
-    GGML_BACKEND_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
-    GGML_BACKEND_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
-    GGML_BACKEND_API struct ggml_threadpool *      ggml_threadpool_new           (struct ggml_threadpool_params * params);
-    GGML_BACKEND_API void                          ggml_threadpool_free          (struct ggml_threadpool * threadpool);
-    GGML_BACKEND_API int                           ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
-    GGML_BACKEND_API void                          ggml_threadpool_pause         (struct ggml_threadpool * threadpool);
-    GGML_BACKEND_API void                          ggml_threadpool_resume        (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new          (struct ggml_threadpool_params * params);
+    GGML_BACKEND_API void                     ggml_threadpool_free         (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API int                      ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API void                     ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API void                     ggml_threadpool_resume       (struct ggml_threadpool * threadpool);

     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
@@ -104,10 +78,10 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_sse3       (void);
     GGML_BACKEND_API int ggml_cpu_has_ssse3      (void);
     GGML_BACKEND_API int ggml_cpu_has_avx        (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx_vnni   (void);
     GGML_BACKEND_API int ggml_cpu_has_avx2       (void);
     GGML_BACKEND_API int ggml_cpu_has_f16c       (void);
     GGML_BACKEND_API int ggml_cpu_has_fma        (void);
-    GGML_BACKEND_API int ggml_cpu_has_avx_vnni   (void);
     GGML_BACKEND_API int ggml_cpu_has_avx512     (void);
     GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
     GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
@@ -2215,6 +2215,37 @@ extern "C" {

     GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);

+    // ggml threadpool
+    // TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend
+    // the goal should be to create an API that other backends can use move everything to the ggml base
+
+    // scheduling priorities
+    enum ggml_sched_priority {
+        GGML_SCHED_PRIO_NORMAL,
+        GGML_SCHED_PRIO_MEDIUM,
+        GGML_SCHED_PRIO_HIGH,
+        GGML_SCHED_PRIO_REALTIME
+    };
+
+    // threadpool params
+    // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
+    struct ggml_threadpool_params {
+        bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
+        int  n_threads;                   // number of threads
+        enum ggml_sched_priority prio;    // thread priority
+        uint32_t poll;                    // polling level (0 - no polling, 100 - aggressive polling)
+        bool strict_cpu;                  // strict cpu placement
+        bool paused;                      // start in paused state
+    };
+
+    struct ggml_threadpool;     // forward declaration, see ggml.c
+
+    typedef struct ggml_threadpool * ggml_threadpool_t;
+
+    GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
+    GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
+    GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
+
 #ifdef __cplusplus
 }
 #endif
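Only the param helpers stay in the base library; creating or freeing a threadpool now goes through the CPU backend's proc addresses (see the llama-bench and main.cpp hunks above). A minimal sketch of the part that remains directly callable:

```cpp
#include "ggml.h"

// sketch: the threadpool param helpers that remain in the base ggml library
void threadpool_params_example(void) {
    struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8); // 8 threads, default prio/poll

    struct ggml_threadpool_params tpp2;
    ggml_threadpool_params_init(&tpp2, 8);

    bool same = ggml_threadpool_params_match(&tpp, &tpp2); // true: identical settings
    (void) same;
}
```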
@@ -202,6 +202,10 @@ endif()

 # ggml

+if (GGML_BACKEND_DL)
+    add_compile_definitions(GGML_BACKEND_DL)
+endif()
+
 add_library(ggml-base
             ../include/ggml.h
             ../include/ggml-alloc.h
@@ -239,11 +243,18 @@ function(ggml_add_backend backend)
         if (${BUILD_SHARED_LIBS})
             target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_BUILD)
             target_compile_definitions(${backend_target} PUBLIC  GGML_BACKEND_SHARED)
+            if (GGML_BACKEND_DL)
+                target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_DL)
+            endif()
+        endif()
+        if (GGML_BACKEND_DL)
+            install(TARGETS ${backend_target} RUNTIME)
+        else()
+            install(TARGETS ${backend_target} LIBRARY)
+            target_link_libraries(ggml PUBLIC ${backend_target})
+            string(TOUPPER "GGML_USE_${backend}" backend_use)
+            target_compile_definitions(ggml PUBLIC ${backend_use})
         endif()
-        install(TARGETS ${backend_target} LIBRARY)
-        target_link_libraries(ggml PUBLIC ${backend_target})
-        string(TOUPPER "GGML_USE_${backend}" backend_use)
-        target_compile_definitions(ggml PUBLIC ${backend_use})
     endif()
 endfunction()
@@ -444,3 +444,5 @@ ggml_backend_reg_t ggml_backend_amx_reg(void) {
 }

 #endif
+
+GGML_BACKEND_DL_IMPL(ggml_backend_amx_reg)
@@ -204,12 +204,30 @@ extern "C" {
         void * context;
     };

     // Internal backend registry API
     void ggml_backend_register(ggml_backend_reg_t reg);
     void ggml_backend_device_register(ggml_backend_dev_t device);

-    // TODO: backends can be loaded as a dynamic library, in which case it needs to export this function
-    // typedef ggml_backend_register_t * (*ggml_backend_init)(void);
+    // Add backend dynamic loading support to the backend
+    #ifdef GGML_BACKEND_DL
+    #ifdef __cplusplus
+    #    define GGML_BACKEND_DL_IMPL(reg_fn)                             \
+            extern "C" {                                                 \
+                GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(); \
+            }                                                            \
+            ggml_backend_reg_t ggml_backend_init() {                     \
+                return reg_fn();                                         \
+            }
+    #else
+    #    define GGML_BACKEND_DL_IMPL(reg_fn)                             \
+            GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init();     \
+            ggml_backend_reg_t ggml_backend_init() {                     \
+                return reg_fn();                                         \
+            }
+    #endif
+    #else
+    #    define GGML_BACKEND_DL_IMPL(reg_fn)
+    #endif

 #ifdef __cplusplus
 }
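Each backend compiled as a shared library ends its source with this macro applied to its registration function, which makes every backend export the same `ggml_backend_init` symbol that `ggml_backend_load()` looks up. A sketch with a hypothetical backend name (the real invocations appear in the per-backend hunks below):

```cpp
// hypothetical backend source file built with GGML_BACKEND_DL
#include "ggml-backend-impl.h"

ggml_backend_reg_t ggml_backend_my_reg(void); // the backend's registration function

// expands to an exported `ggml_backend_init` that forwards to ggml_backend_my_reg,
// so the loader can resolve it with dlsym()/GetProcAddress()
GGML_BACKEND_DL_IMPL(ggml_backend_my_reg)
```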
@@ -1,11 +1,13 @@
 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"
-#include "ggml-cpu.h"
 #include "ggml-impl.h"
 #include <cstring>
 #include <vector>

 // Backend registry

+#ifdef GGML_USE_CPU
+#include "ggml-cpu.h"
+#endif
+
 #ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
@@ -75,8 +77,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_KOMPUTE
         register_backend(ggml_backend_kompute_reg());
 #endif
-
+#ifdef GGML_USE_CPU
         register_backend(ggml_backend_cpu_reg());
+#endif
     }

     void register_backend(ggml_backend_reg_t reg) {
@@ -193,3 +196,86 @@ ggml_backend_t ggml_backend_init_best(void) {
     }
     return ggml_backend_dev_init(dev, NULL);
 }
+
+#ifdef _WIN32
+#    define WIN32_LEAN_AND_MEAN
+#    ifndef NOMINMAX
+#        define NOMINMAX
+#    endif
+#    include <windows.h>
+#else
+#    include <dlfcn.h>
+#endif
+
+typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
+
+ggml_backend_reg_t ggml_backend_load(const char * path) {
+#ifdef _WIN32
+    HMODULE handle = LoadLibraryA(path);
+    if (!handle) {
+        GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError());
+        return NULL;
+    }
+    ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init");
+    if (!backend_init) {
+        GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError());
+        FreeLibrary(handle);
+        return NULL;
+    }
+    ggml_backend_reg_t reg = backend_init();
+    if (!reg) {
+        GGML_LOG_ERROR("%s: failed to initialize backend from %s\n", __func__, path);
+        FreeLibrary(handle);
+        return NULL;
+    }
+    GGML_LOG_DEBUG("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
+    ggml_backend_register(reg);
+    return reg;
+#else
+    void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
+    if (!handle) {
+        GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror());
+        return NULL;
+    }
+    auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init");
+    if (!backend_init) {
+        GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror());
+        dlclose(handle);
+        return NULL;
+    }
+    ggml_backend_reg_t reg = backend_init();
+    if (!reg) {
+        GGML_LOG_ERROR("%s: failed to initialize backend from %s\n", __func__, path);
+        dlclose(handle);
+        return NULL;
+    }
+    GGML_LOG_DEBUG("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
+    ggml_backend_register(reg);
+    return reg;
+#endif
+}
+
+void ggml_backend_load_all() {
+#ifdef _WIN32
+    #define GGML_BACKEND_PATH(backend) "ggml-" backend ".dll"
+#elif defined(__APPLE__)
+    // path is hardcoded to the cmake build directory for now
+    // FIXME: should also search default system paths
+    #define GGML_BACKEND_PATH(backend) "build/ggml/src/ggml-" backend "/libggml-" backend ".dylib"
+#else
+    #define GGML_BACKEND_PATH(backend) "build/ggml/src/ggml-" backend "/libggml-" backend ".so"
+#endif
+
+    ggml_backend_load(GGML_BACKEND_PATH("amx"));
+    ggml_backend_load(GGML_BACKEND_PATH("blas"));
+    ggml_backend_load(GGML_BACKEND_PATH("cann"));
+    ggml_backend_load(GGML_BACKEND_PATH("cuda"));
+    ggml_backend_load(GGML_BACKEND_PATH("hip"));
+    ggml_backend_load(GGML_BACKEND_PATH("kompute"));
+    ggml_backend_load(GGML_BACKEND_PATH("metal"));
+    ggml_backend_load(GGML_BACKEND_PATH("rpc"));
+    ggml_backend_load(GGML_BACKEND_PATH("sycl"));
+    ggml_backend_load(GGML_BACKEND_PATH("vulkan"));
+    ggml_backend_load(GGML_BACKEND_PATH("musa"));
+    ggml_backend_load(GGML_BACKEND_PATH("cpu"));
+}
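`ggml_backend_load_all()` simply tries a fixed list of library names, so a host that ships its own layout can also load a single backend explicitly. A sketch, with the path being an assumption that matches the hardcoded build-directory layout above:

```cpp
#include "ggml-backend.h"
#include <cstdio>

void load_cuda_backend_example(void) {
    // path assumed from the build-directory layout above; adjust per deployment
    ggml_backend_reg_t reg = ggml_backend_load("build/ggml/src/ggml-cuda/libggml-cuda.so");
    if (reg == NULL) {
        // not fatal: the library may simply not be present on this machine
        fprintf(stderr, "CUDA backend not available\n");
    }
}
```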
@@ -512,3 +512,5 @@ ggml_backend_reg_t ggml_backend_blas_reg(void) {

     return &ggml_backend_blas_reg;
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg)
@@ -2126,3 +2126,5 @@ void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
     ggml_cann_set_device(device);
     ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_cann_reg)
@@ -13578,29 +13578,6 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int

 #endif // GGML_USE_OPENMP

-void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
-    p->n_threads  = n_threads;
-    p->prio       = 0;     // default priority (usually means normal or inherited)
-    p->poll       = 50;    // hybrid-polling enabled
-    p->strict_cpu = false; // no strict placement (all threads share same cpumask)
-    p->paused     = false; // threads are ready to go
-    memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
-}
-
-struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
-    struct ggml_threadpool_params p;
-    ggml_threadpool_params_init(&p, n_threads);
-    return p;
-}
-
-bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
-    if (p0->n_threads  != p1->n_threads )  return false;
-    if (p0->prio       != p1->prio      )  return false;
-    if (p0->poll       != p1->poll      )  return false;
-    if (p0->strict_cpu != p1->strict_cpu)  return false;
-    return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
-}
-
 static struct ggml_threadpool * ggml_threadpool_new_impl(
     struct ggml_threadpool_params * tpp,
                struct ggml_cgraph * cgraph,
@@ -541,16 +541,12 @@ static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg
     return &ggml_backend_cpu_device;
 }

-struct ggml_backend_feature {
-    const char * name;
-    const char * value;
-};
-
-// Not used yet
 // This is intended to replace the the ggml_cpu_has_* functions when loading the CPU backend dynamically,
-// and additionally to allow other backends to expose their own list of features that applications can query using the same API.
+// and additionally to allow other backends to expose their own list of features that applications can query using the same API
 static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) {
     static std::vector<ggml_backend_feature> features = []() {
+        ggml_cpu_init();
+
         std::vector<ggml_backend_feature> features;
         if (ggml_cpu_has_sse3()) {
             features.push_back({ "SSE3", "1" });
@@ -561,6 +557,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
         if (ggml_cpu_has_avx()) {
             features.push_back({ "AVX", "1" });
         }
+        if (ggml_cpu_has_avx_vnni()) {
+            features.push_back({ "AVX_VNNI", "1" });
+        }
         if (ggml_cpu_has_avx2()) {
             features.push_back({ "AVX2", "1" });
         }
@@ -570,9 +569,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
         if (ggml_cpu_has_fma()) {
             features.push_back({ "FMA", "1" });
         }
-        if (ggml_cpu_has_avx_vnni()) {
-            features.push_back({ "AVX_VNNI", "1" });
-        }
         if (ggml_cpu_has_avx512()) {
             features.push_back({ "AVX512", "1" });
         }
@@ -619,6 +615,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
         if (ggml_cpu_has_llamafile()) {
             features.push_back({ "LLAMAFILE", "1" });
         }
+        // TODO: rename this
+    #ifdef GGML_USE_CPU_AARCH64
+        features.push_back({ "AARCH64_REPACK", "1" });
+    #endif

         features.push_back({ nullptr, nullptr });

@@ -637,6 +637,29 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
     if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
         return (void *)ggml_backend_cpu_get_extra_bufts;
     }
+    if (strcmp(name, "ggml_backend_get_features") == 0) {
+        return (void *)ggml_backend_cpu_get_features;
+    }
+    if (strcmp(name, "ggml_backend_set_abort_callback") == 0) {
+        return (void *)ggml_backend_cpu_set_abort_callback;
+    }
+    if (strcmp(name, "ggml_backend_cpu_numa_init") == 0) {
+        return (void *)ggml_numa_init;
+    }
+    if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) {
+        return (void *)ggml_is_numa;
+    }
+
+    // threadpool - TODO: move to ggml-base
+    if (strcmp(name, "ggml_threadpool_new") == 0) {
+        return (void *)ggml_threadpool_new;
+    }
+    if (strcmp(name, "ggml_threadpool_free") == 0) {
+        return (void *)ggml_threadpool_free;
+    }
+    if (strcmp(name, "ggml_backend_cpu_set_threadpool") == 0) {
+        return (void *)ggml_backend_cpu_set_threadpool;
+    }

     return NULL;

@@ -661,3 +684,5 @@ ggml_backend_reg_t ggml_backend_cpu_reg(void) {

     return &ggml_backend_cpu_reg;
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_cpu_reg)
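The lookup table above is the other half of the pattern used throughout this commit: functions that used to be ordinary exported symbols of the CPU backend are now resolved at runtime by name. A condensed sketch of the caller side (the declarations still come from ggml-cpu.h, which is why `decltype` can recover the exact signatures):

```cpp
#include "ggml-backend.h"
#include "ggml-cpu.h"

// sketch: create a threadpool without linking the CPU backend directly
struct ggml_threadpool * make_threadpool(int n_threads) {
    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    ggml_backend_reg_t cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);

    auto * threadpool_new_fn = (decltype(ggml_threadpool_new) *)
        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");

    struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
    return threadpool_new_fn(&tpp);
}
```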
@@ -3126,6 +3126,61 @@ static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t re
     return ctx->devices[index];
 }

+static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t reg) {
+    static std::vector<ggml_backend_feature> features = []() {
+        std::vector<ggml_backend_feature> features;
+    #define _STRINGIFY(...) #__VA_ARGS__
+    #define STRINGIFY(...) _STRINGIFY(__VA_ARGS__)
+
+    #ifdef __CUDA_ARCH_LIST__
+        features.push_back({ "ARCHS", STRINGIFY(__CUDA_ARCH_LIST__) });
+    #endif
+
+    #ifdef GGML_CUDA_FORCE_MMQ
+        features.push_back({ "FORCE_MMQ", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_FORCE_CUBLAS
+        features.push_back({ "FORCE_CUBLAS", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_NO_VMM
+        features.push_back({ "NO_VMM", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_NO_PEER_COPY
+        features.push_back({ "NO_PEER_COPY", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_F16
+        features.push_back({ "F16", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_USE_GRAPHS
+        features.push_back({ "USE_GRAPHS", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
+        features.push_back({ "PEER_MAX_BATCH_SIZE", STRINGIFY(GGML_CUDA_PEER_MAX_BATCH_SIZE) });
+    #endif
+
+    #ifdef GGML_CUDA_FA_ALL_QUANTS
+        features.push_back({ "FA_ALL_QUANTS", "1" });
+    #endif
+
+    #undef _STRINGIFY
+    #undef STRINGIFY
+
+        features.push_back({ nullptr, nullptr });
+
+        return features;
+    }();
+
+    return features.data();
+
+    GGML_UNUSED(reg);
+}
+
 static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
     GGML_UNUSED(reg);
     if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
@@ -3137,6 +3192,9 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con
     if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) {
         return (void *)ggml_backend_cuda_unregister_host_buffer;
     }
+    if (strcmp(name, "ggml_backend_get_features") == 0) {
+        return (void *)ggml_backend_cuda_get_features;
+    }
     return nullptr;
 }

@@ -3209,3 +3267,5 @@ ggml_backend_t ggml_backend_cuda_init(int device) {

     return cuda_backend;
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_cuda_reg)
@@ -2182,3 +2182,5 @@ ggml_backend_reg_t ggml_backend_kompute_reg() {

     return &reg;
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_kompute_reg)
@@ -4396,3 +4396,5 @@ ggml_backend_reg_t ggml_backend_metal_reg(void) {

     return &g_ggml_backend_metal_reg;
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_metal_reg)
@@ -1401,3 +1401,5 @@ ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) {

     return dev;
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_rpc_reg)
@@ -4678,3 +4678,4 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
     return sycl_backend;
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_sycl_reg)
@@ -7365,3 +7365,5 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
     VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")");
 }
 #endif
+
+GGML_BACKEND_DL_IMPL(ggml_backend_vk_reg)
@@ -7571,3 +7571,26 @@ void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
     g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
     g_logger_state.log_callback_user_data = user_data;
 }
+
+void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
+    p->n_threads  = n_threads;
+    p->prio       = 0;     // default priority (usually means normal or inherited)
+    p->poll       = 50;    // hybrid-polling enabled
+    p->strict_cpu = false; // no strict placement (all threads share same cpumask)
+    p->paused     = false; // threads are ready to go
+    memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
+}
+
+struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
+    struct ggml_threadpool_params p;
+    ggml_threadpool_params_init(&p, n_threads);
+    return p;
+}
+
+bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
+    if (p0->n_threads  != p1->n_threads )  return false;
+    if (p0->prio       != p1->prio      )  return false;
+    if (p0->poll       != p1->poll      )  return false;
+    if (p0->strict_cpu != p1->strict_cpu)  return false;
+    return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
+}
@@ -8,5 +8,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})

 if (EMSCRIPTEN)
 else()
-    add_subdirectory(vdot)
+    if (NOT GGML_BACKEND_DL)
+        add_subdirectory(vdot)
+    endif()
 endif()
@@ -4866,7 +4866,9 @@ struct llama_model_loader {
         mappings.reserve(files.size());
         mmaps_used.reserve(files.size());
         for (const auto & file : files) {
-            std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
+            auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
+            auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
+            std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn()));
             mmaps_used.emplace_back(mapping->size, 0);
             if (mlock_mmaps) {
                 std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
@@ -9190,7 +9192,7 @@ static bool llm_load_tensors(
         ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
         if (!dev) {
             // FIXME: workaround for CPU backend buft having a NULL device
-            dev = ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0);
+            dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
         }
         ggml_backend_dev_props props;
         ggml_backend_dev_get_props(dev, &props);
@@ -17443,8 +17445,9 @@ static enum ggml_status llama_graph_compute(
           int   n_threads,
         ggml_threadpool * threadpool) {
     if (lctx.backend_cpu != nullptr) {
-        ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
-        ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
+        auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu));
+        auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
+        set_threadpool_fn(lctx.backend_cpu, threadpool);
     }

     // set the number of threads for all the backends
@@ -19478,7 +19481,11 @@ void llama_backend_init(void) {

 void llama_numa_init(enum ggml_numa_strategy numa) {
     if (numa != GGML_NUMA_STRATEGY_DISABLED) {
-        ggml_numa_init(numa);
+        auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        GGML_ASSERT(dev && "CPU backend is not loaded");
+        auto * reg = ggml_backend_dev_backend_reg(dev);
+        auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
+        numa_init_fn(numa);
     }
 }

@@ -19752,9 +19759,6 @@ struct llama_context * llama_new_context_with_model(
                 __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }

-    ctx->abort_callback      = params.abort_callback;
-    ctx->abort_callback_data = params.abort_callback_data;
-
     ctx->logits_all = params.logits_all;

     // build worst-case graph for encoder if a model contains encoder
@@ -19803,7 +19807,7 @@ struct llama_context * llama_new_context_with_model(
         }

         // add CPU backend
-        ctx->backend_cpu = ggml_backend_cpu_init();
+        ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
         if (ctx->backend_cpu == nullptr) {
             LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
             llama_free(ctx);
@@ -19823,6 +19827,8 @@ struct llama_context * llama_new_context_with_model(
             }
         }

+        llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data);
+
         if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
             LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
@@ -19868,7 +19874,8 @@ struct llama_context * llama_new_context_with_model(
         std::vector<ggml_backend_t> backend_ptrs;
         for (auto & backend : ctx->backends) {
             auto * buft = ggml_backend_get_default_buffer_type(backend.get());
-            if (ggml_backend_is_cpu(backend.get()) && !model->devices.empty()) {
+            auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
+            if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) {
                 // use the host buffer of the first device CPU for faster transfer of the intermediate state
                 auto * dev = model->devices[0];
                 auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
@@ -19896,7 +19903,8 @@ struct llama_context * llama_new_context_with_model(
         // pipeline parallelism requires support for async compute and events in all devices
         if (pipeline_parallel) {
             for (auto & backend : ctx->backends) {
-                if (ggml_backend_is_cpu(backend.get())) {
+                auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
+                if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
                     // ignore CPU backend
                     continue;
                 }
@@ -21450,6 +21458,14 @@ int32_t llama_n_threads_batch(struct llama_context * ctx) {
 void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
     ctx->abort_callback      = abort_callback;
     ctx->abort_callback_data = abort_callback_data;
+
+    for (auto & backend : ctx->backends) {
+        auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
+        auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
+        if (set_abort_callback_fn) {
+            set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data);
+        }
+    }
 }

 void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
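Since the abort-callback setter is optional per backend, `llama_set_abort_callback` now probes each backend's registry and only forwards the callback when the proc address exists. A sketch of the application side, with the flag-based callback being illustrative:

```cpp
#include "llama.h"

// illustrative: abort generation when some other thread sets g_stop
static volatile bool g_stop = false;

static bool should_abort(void * data) {
    return *(volatile bool *) data; // returning true aborts graph computation
}

// usage (ctx is an existing llama_context):
//     llama_set_abort_callback(ctx, should_abort, (void *) &g_stop);
```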
@@ -22191,32 +22207,23 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
 }

 const char * llama_print_system_info(void) {
-    ggml_cpu_init(); // some ARM features are detected at runtime
-
     static std::string s;

-    s  = "";
-    s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
-    s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | ";
-    s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
-    s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
-    s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
-    s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
-    s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
-    s += "AMX_INT8 = " + std::to_string(ggml_cpu_has_amx_int8()) + " | ";
-    s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
-    s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
-    s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
-    s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
-    s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
-    s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
-    s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-    s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
-    s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
-    s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
-    s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
-    s += "LLAMAFILE = " + std::to_string(ggml_cpu_has_llamafile()) + " | ";
+    for (int i = 0; i < ggml_backend_reg_count(); i++) {
+        auto * reg = ggml_backend_reg_get(i);
+        auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
+        if (get_features_fn) {
+            ggml_backend_feature * features = get_features_fn(reg);
+            s += ggml_backend_reg_name(reg);
+            s += " : ";
+            for (; features->name; features++) {
+                s += features->name;
+                s += " = ";
+                s += features->value;
+                s += " | ";
+            }
+        }
+    }

     return s.c_str();
 }
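The system-info string is no longer a fixed list of `ggml_cpu_has_*` flags; it is assembled per registered backend from the feature arrays introduced in this commit. The exact contents therefore depend on which backends were loaded and how they were compiled; the output below is illustrative only:

```cpp
#include "llama.h"
#include <cstdio>

// illustrative output, not captured from a real run:
//   CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | FMA = 1 | LLAMAFILE = 1 |
void print_system_info_example(void) {
    printf("%s\n", llama_print_system_info());
}
```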
@@ -110,23 +110,26 @@ llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CU
 # llama_target_and_test(test-double-float.cpp) # SLOW
 llama_target_and_test(test-log.cpp)
 llama_target_and_test(test-arg-parser.cpp)
-llama_target_and_test(test-quantize-fns.cpp)
-llama_target_and_test(test-quantize-perf.cpp)
 llama_target_and_test(test-sampling.cpp)
 llama_target_and_test(test-chat-template.cpp)

 llama_target_and_test(test-grammar-parser.cpp)
 llama_target_and_test(test-grammar-integration.cpp)
 llama_target_and_test(test-llama-grammar.cpp)
-llama_target_and_test(test-barrier.cpp)
 # llama_target_and_test(test-opt.cpp) # SLOW
 llama_target_and_test(test-backend-ops.cpp)

-llama_target_and_test(test-rope.cpp)
-
 llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
 llama_target_and_test(test-autorelease.cpp       LABEL "model")

+if (NOT GGML_BACKEND_DL)
+    # these tests use the backends directly and cannot be built with dynamic loading
+    llama_target_and_test(test-barrier.cpp)
+    llama_target_and_test(test-quantize-fns.cpp)
+    llama_target_and_test(test-quantize-perf.cpp)
+    llama_target_and_test(test-rope.cpp)
+endif()
+
 # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
 if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
     llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
@@ -16,7 +16,6 @@


 #include <ggml.h>
-#include <ggml-cpu.h>
 #include <ggml-alloc.h>
 #include <ggml-backend.h>

@@ -26,7 +25,6 @@
 #include <cstdint>
 #include <cstring>
 #include <cinttypes>
-#include <functional>
 #include <memory>
 #include <random>
 #include <stdio.h>
@@ -639,19 +637,20 @@ struct test_case {

         // determine number of runs
         int n_runs;
+        bool is_cpu = ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_CPU;
         if (op_flops(out) > 0) {
             // based on flops
             const uint64_t GFLOP = 1000 * 1000 * 1000;
             const uint64_t target_flops_cpu =   8ULL * GFLOP;
             const uint64_t target_flops_gpu = 100ULL * GFLOP;
-            uint64_t target_flops = ggml_backend_is_cpu(backend) ? target_flops_cpu : target_flops_gpu;
+            uint64_t target_flops = is_cpu ? target_flops_cpu : target_flops_gpu;
             n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_flops / op_flops(out)) + 1;
         } else {
             // based on memory size
             const size_t GB = 1ULL << 30;
             const size_t target_size_cpu  =  8 * GB;
             const size_t target_size_gpu  = 32 * GB;
-            size_t target_size = ggml_backend_is_cpu(backend) ? target_size_cpu : target_size_gpu;
+            size_t target_size = is_cpu ? target_size_cpu : target_size_gpu;
             n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
         }

@@ -3873,7 +3872,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
 static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
     if (mode == MODE_TEST) {
         auto test_cases = make_test_cases_eval();
-        ggml_backend_t backend_cpu = ggml_backend_cpu_init();
+        ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
+        if (backend_cpu == NULL) {
+            printf("  Failed to initialize CPU backend\n");
+            return false;
+        }

         size_t n_ok = 0;
         for (auto & test : test_cases) {
@@ -3953,7 +3956,9 @@ int main(int argc, char ** argv) {
         }
     }

-    // enumerate backends
+    // load and enumerate backends
+    ggml_backend_load_all();
+
     printf("Testing %zu devices\n\n", ggml_backend_dev_count());

     size_t n_ok = 0;
@@ -3969,16 +3974,15 @@ int main(int argc, char ** argv) {
             continue;
         }

-        ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
-        GGML_ASSERT(backend != NULL);
-
-        if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) {
+        if (backend_filter == NULL && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU && mode != MODE_GRAD) {
             printf("  Skipping CPU backend\n");
-            ggml_backend_free(backend);
             n_ok++;
             continue;
         }

+        ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
+        GGML_ASSERT(backend != NULL);
+
         ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
         auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
         if (ggml_backend_set_n_threads_fn) {