diff --git a/common/common.cpp b/common/common.cpp
index d314523db..1e06cad72 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -377,6 +377,9 @@ void common_init() {
 #endif
 
     LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
+
+    // load dynamic backends
+    ggml_backend_load_all();
 }
 
 std::string common_params_get_system_info(const common_params & params) {
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index d63a96c1c..bceded4e4 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -12,13 +12,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 
 if (EMSCRIPTEN)
 else()
-    add_subdirectory(cvector-generator)
     add_subdirectory(batched-bench)
     add_subdirectory(batched)
-    add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)
-    add_subdirectory(export-lora)
     add_subdirectory(gbnf-validator)
     add_subdirectory(gguf-hash)
     add_subdirectory(gguf-split)
@@ -27,28 +24,34 @@ else()
     add_subdirectory(imatrix)
     add_subdirectory(infill)
     add_subdirectory(llama-bench)
-    add_subdirectory(llava)
     add_subdirectory(lookahead)
     add_subdirectory(lookup)
     add_subdirectory(main)
     add_subdirectory(parallel)
     add_subdirectory(passkey)
     add_subdirectory(perplexity)
-    add_subdirectory(quantize-stats)
     add_subdirectory(quantize)
     add_subdirectory(retrieval)
-    if (GGML_RPC)
-        add_subdirectory(rpc)
-    endif()
     if (LLAMA_BUILD_SERVER)
-        add_subdirectory(server)
-    endif()
-    if (GGML_SYCL)
-        add_subdirectory(sycl)
+        add_subdirectory(server)
     endif()
     add_subdirectory(save-load-state)
     add_subdirectory(simple)
     add_subdirectory(simple-chat)
     add_subdirectory(speculative)
     add_subdirectory(tokenize)
+    if (NOT GGML_BACKEND_DL)
+        # these examples use the backends directly and cannot be built with dynamic loading
+        add_subdirectory(convert-llama2c-to-ggml)
+        add_subdirectory(cvector-generator)
+        add_subdirectory(export-lora)
+        add_subdirectory(quantize-stats)
+        add_subdirectory(llava)
+        if (GGML_RPC)
+            add_subdirectory(rpc)
+        endif()
+        if (GGML_SYCL)
+            add_subdirectory(sycl)
+        endif()
+    endif()
 endif()
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 3dc84a75c..bac606f47 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -1477,6 +1477,17 @@ int main(int argc, char ** argv) {
 
     cmd_params params = parse_cmd_params(argc, argv);
 
+    // initialize backends
+    ggml_backend_load_all();
+    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (!cpu_dev) {
+        fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
+        return 1;
+    }
+    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+    auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
+    auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free");
+
     // initialize llama.cpp
     if (!params.verbose) {
         llama_log_set(llama_null_log_callback, NULL);
@@ -1551,7 +1562,7 @@ int main(int argc, char ** argv) {
         tpp.poll = t.poll;
         tpp.prio = params.prio;
 
-        struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+        struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
         if (!threadpool) {
             fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
             exit(1);
@@ -1612,7 +1623,7 @@ int main(int argc, char ** argv) {
 
         llama_free(ctx);
 
-        ggml_threadpool_free(threadpool);
+        ggml_threadpool_free_fn(threadpool);
     }
 
     llama_free_model(lmodel);
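The llama-bench changes above are the canonical pattern of this patch: with `GGML_BACKEND_DL`, the CPU backend's threadpool entry points are no longer linked directly, so they are resolved through the registry at runtime. A minimal standalone sketch of that pattern, assuming a build with `GGML_BACKEND_DL=ON` and the stock `ggml-backend.h`/`ggml-cpu.h` headers:

```cpp
// Resolve the CPU backend's threadpool functions through the registry instead
// of linking them. decltype(...) reuses the declarations from ggml-cpu.h, so
// no backend library is linked at build time.
#include "ggml-backend.h"
#include "ggml-cpu.h"
#include <cstdio>

int main() {
    ggml_backend_load_all(); // discover and register the dynamic backends

    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (!cpu_dev) {
        fprintf(stderr, "CPU backend is not loaded\n");
        return 1;
    }

    ggml_backend_reg_t cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
    auto * tp_new  = (decltype(ggml_threadpool_new)  *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
    auto * tp_free = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free");
    if (!tp_new || !tp_free) {
        fprintf(stderr, "threadpool functions are not exported by this backend\n");
        return 1;
    }

    struct ggml_threadpool_params tpp = ggml_threadpool_params_default(4);
    struct ggml_threadpool * tp = tp_new(&tpp);
    // ... run graph computations on the pool ...
    tp_free(tp);
    return 0;
}
```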
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 7c4ce4be2..775eec1b1 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -165,6 +165,10 @@ int main(int argc, char ** argv) {
 
     LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
 
+    auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
+    auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new");
+    auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free");
+
     struct ggml_threadpool_params tpp_batch =
             ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
     struct ggml_threadpool_params tpp =
@@ -174,7 +178,7 @@ int main(int argc, char ** argv) {
 
     struct ggml_threadpool * threadpool_batch = NULL;
     if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
-        threadpool_batch = ggml_threadpool_new(&tpp_batch);
+        threadpool_batch = ggml_threadpool_new_fn(&tpp_batch);
         if (!threadpool_batch) {
             LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
             return 1;
@@ -184,7 +188,7 @@ int main(int argc, char ** argv) {
         tpp.paused = true;
     }
 
-    struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+    struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
     if (!threadpool) {
         LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
         return 1;
@@ -890,8 +894,8 @@ int main(int argc, char ** argv) {
 
     llama_backend_free();
 
-    ggml_threadpool_free(threadpool);
-    ggml_threadpool_free(threadpool_batch);
+    ggml_threadpool_free_fn(threadpool);
+    ggml_threadpool_free_fn(threadpool_batch);
 
     return 0;
 }
diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp
index 5f9973163..7f4da666b 100644
--- a/examples/simple-chat/simple-chat.cpp
+++ b/examples/simple-chat/simple-chat.cpp
@@ -62,6 +62,9 @@ int main(int argc, char ** argv) {
         }
     }, nullptr);
 
+    // load dynamic backends
+    ggml_backend_load_all();
+
     // initialize the model
     llama_model_params model_params = llama_model_default_params();
     model_params.n_gpu_layers = ngl;
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index 59760fe95..3288c0250 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -74,6 +74,10 @@ int main(int argc, char ** argv) {
         }
     }
 
+    // load dynamic backends
+
+    ggml_backend_load_all();
+
     // initialize the model
 
     llama_model_params model_params = llama_model_default_params();
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index 2d32da1b6..70b5cfdf7 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -33,6 +33,7 @@ else()
 endif()
 
 option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
+option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
 
 #
 # option list
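For applications, the only required change is the one shown in `simple-chat.cpp` and `simple.cpp`: call `ggml_backend_load_all()` before anything queries devices. A sketch of the minimal call order — the model path and parameter values are placeholders:

```cpp
// Minimal call order once backends are dynamic: load them first, then the
// registry can answer device queries during model loading.
#include "ggml-backend.h"
#include "llama.h"

int main() {
    ggml_backend_load_all(); // must run before any device/model query

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99; // offloads only if a GPU backend was found at runtime

    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
    if (model == NULL) {
        return 1;
    }
    // ... create a context and generate ...
    llama_free_model(model);
    return 0;
}
```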
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index cef164764..d9aca71ae 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -190,6 +190,14 @@ extern "C" {
     typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
     // Get additional buffer types provided by the device (returns a NULL-terminated array)
     typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
+    // Set the abort callback for the backend
+    typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
+    // Get a list of feature flags supported by the backend (returns a NULL-terminated array)
+    struct ggml_backend_feature {
+        const char * name;
+        const char * value;
+    };
+    typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);
 
     //
     // Backend registry
@@ -214,6 +222,11 @@ extern "C" {
     // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
     GGML_API ggml_backend_t ggml_backend_init_best(void);
 
+    // Load a backend from a dynamic library
+    GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
+    // Load all known backends from dynamic libraries
+    GGML_API void ggml_backend_load_all(void);
+
     //
     // Backend scheduler
     //
diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
index 7571ef979..a5358d047 100644
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -7,29 +7,6 @@
 extern "C" {
 #endif
 
-    // Scheduling priorities
-    enum ggml_sched_priority {
-        GGML_SCHED_PRIO_NORMAL,
-        GGML_SCHED_PRIO_MEDIUM,
-        GGML_SCHED_PRIO_HIGH,
-        GGML_SCHED_PRIO_REALTIME
-    };
-
-    // Threadpool params
-    // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
-    struct ggml_threadpool_params {
-        bool                cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
-        int                 n_threads;                   // number of threads
-        enum ggml_sched_priority prio;                   // thread priority
-        uint32_t            poll;                        // polling level (0 - no polling, 100 - aggressive polling)
-        bool                strict_cpu;                  // strict cpu placement
-        bool                paused;                      // start in paused state
-    };
-
-    struct ggml_threadpool;     // forward declaration, see ggml.c
-
-    typedef struct ggml_threadpool * ggml_threadpool_t;
-
     // the compute plan that needs to be prepared for ggml_graph_compute()
     // since https://github.com/ggerganov/ggml/issues/287
     struct ggml_cplan {
@@ -75,14 +52,11 @@ extern "C" {
     GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
     GGML_BACKEND_API void  ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
 
-    GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
-    GGML_BACKEND_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
-    GGML_BACKEND_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
-    GGML_BACKEND_API struct ggml_threadpool *      ggml_threadpool_new           (struct ggml_threadpool_params * params);
-    GGML_BACKEND_API void                          ggml_threadpool_free          (struct ggml_threadpool * threadpool);
-    GGML_BACKEND_API int                           ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
-    GGML_BACKEND_API void                          ggml_threadpool_pause         (struct ggml_threadpool * threadpool);
-    GGML_BACKEND_API void                          ggml_threadpool_resume        (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new          (struct ggml_threadpool_params * params);
+    GGML_BACKEND_API void                      ggml_threadpool_free         (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API int                       ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API void                      ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API void                      ggml_threadpool_resume       (struct ggml_threadpool * threadpool);
 
     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
@@ -104,10 +78,10 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_sse3       (void);
     GGML_BACKEND_API int ggml_cpu_has_ssse3      (void);
     GGML_BACKEND_API int ggml_cpu_has_avx        (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx_vnni   (void);
     GGML_BACKEND_API int ggml_cpu_has_avx2       (void);
     GGML_BACKEND_API int ggml_cpu_has_f16c       (void);
     GGML_BACKEND_API int ggml_cpu_has_fma        (void);
-    GGML_BACKEND_API int ggml_cpu_has_avx_vnni   (void);
     GGML_BACKEND_API int ggml_cpu_has_avx512     (void);
     GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
     GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
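The new `ggml_backend_get_features_t` hook is deliberately optional: it is exposed through `ggml_backend_reg_get_proc_address` rather than the registry vtable, so existing backends keep working unchanged. A sketch of how an application can enumerate the reported flags, mirroring what `llama_print_system_info` does later in this patch:

```cpp
// Enumerate the feature flags of every registered backend through the
// optional "ggml_backend_get_features" proc address.
#include "ggml-backend.h"
#include <cstdio>

static void print_backend_features(ggml_backend_reg_t reg) {
    auto get_features = (ggml_backend_get_features_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
    if (!get_features) {
        return; // optional hook: not every backend implements it
    }
    for (ggml_backend_feature * f = get_features(reg); f->name; f++) {
        printf("%s: %s = %s\n", ggml_backend_reg_name(reg), f->name, f->value);
    }
}

int main() {
    ggml_backend_load_all();
    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
        print_backend_features(ggml_backend_reg_get(i));
    }
    return 0;
}
```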
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 69e6a2434..9843b09fb 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2215,6 +2215,37 @@ extern "C" {
 
     GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
 
+    // ggml threadpool
+    // TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend
+    // the goal should be to create an API that other backends can use, then move everything to the ggml base
+
+    // scheduling priorities
+    enum ggml_sched_priority {
+        GGML_SCHED_PRIO_NORMAL,
+        GGML_SCHED_PRIO_MEDIUM,
+        GGML_SCHED_PRIO_HIGH,
+        GGML_SCHED_PRIO_REALTIME
+    };
+
+    // threadpool params
+    // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
+    struct ggml_threadpool_params {
+        bool                cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
+        int                 n_threads;                   // number of threads
+        enum ggml_sched_priority prio;                   // thread priority
+        uint32_t            poll;                        // polling level (0 - no polling, 100 - aggressive polling)
+        bool                strict_cpu;                  // strict cpu placement
+        bool                paused;                      // start in paused state
+    };
+
+    struct ggml_threadpool;     // forward declaration, see ggml.c
+
+    typedef struct ggml_threadpool * ggml_threadpool_t;
+
+    GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
+    GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
+    GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 8df0e85c0..c506a413d 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -202,6 +202,10 @@ endif()
 
 # ggml
 
+if (GGML_BACKEND_DL)
+    add_compile_definitions(GGML_BACKEND_DL)
+endif()
+
 add_library(ggml-base
             ../include/ggml.h
             ../include/ggml-alloc.h
@@ -239,11 +243,18 @@ function(ggml_add_backend backend)
         if (${BUILD_SHARED_LIBS})
             target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_BUILD)
             target_compile_definitions(${backend_target} PUBLIC  GGML_BACKEND_SHARED)
+            if (GGML_BACKEND_DL)
+                target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_DL)
+            endif()
+        endif()
+        if (GGML_BACKEND_DL)
+            install(TARGETS ${backend_target} RUNTIME)
+        else()
+            install(TARGETS ${backend_target} LIBRARY)
+            target_link_libraries(ggml PUBLIC ${backend_target})
+            string(TOUPPER "GGML_USE_${backend}" backend_use)
+            target_compile_definitions(ggml PUBLIC ${backend_use})
         endif()
-        install(TARGETS ${backend_target} LIBRARY)
-        target_link_libraries(ggml PUBLIC ${backend_target})
-        string(TOUPPER "GGML_USE_${backend}" backend_use)
-        target_compile_definitions(ggml PUBLIC ${backend_use})
     endif()
 endfunction()
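Moving the threadpool *parameter* helpers into `ggml.h` means applications can build and compare configurations against ggml-base alone; only creating the pool itself still goes through the CPU backend (via the proc-address lookups shown earlier). A small sketch:

```cpp
// The parameter helpers now live in the ggml base, so this compiles and links
// without any CPU-backend symbols; creating the pool would still go through
// the backend's exported "ggml_threadpool_new".
#include "ggml.h"

int main() {
    struct ggml_threadpool_params batch  = ggml_threadpool_params_default(8);
    struct ggml_threadpool_params decode = ggml_threadpool_params_default(4);
    decode.poll = 0; // disable polling for the lighter decode pool

    // allocate a second pool only if the configurations actually differ,
    // the same check main.cpp performs above
    if (!ggml_threadpool_params_match(&batch, &decode)) {
        // ... create a dedicated decode pool ...
    }
    return 0;
}
```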
diff --git a/ggml/src/ggml-amx/ggml-amx.cpp b/ggml/src/ggml-amx/ggml-amx.cpp
index 8568e7965..0e1326668 100644
--- a/ggml/src/ggml-amx/ggml-amx.cpp
+++ b/ggml/src/ggml-amx/ggml-amx.cpp
@@ -444,3 +444,5 @@ ggml_backend_reg_t ggml_backend_amx_reg(void) {
 }
 
 #endif
+
+GGML_BACKEND_DL_IMPL(ggml_backend_amx_reg)
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
index fa8d5b7fb..e22ddbb02 100644
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -204,12 +204,30 @@ extern "C" {
         void * context;
     };
 
-    // Internal backend registry API
     void ggml_backend_register(ggml_backend_reg_t reg);
     void ggml_backend_device_register(ggml_backend_dev_t device);
 
-    // TODO: backends can be loaded as a dynamic library, in which case it needs to export this function
-    // typedef ggml_backend_register_t * (*ggml_backend_init)(void);
+    // Add backend dynamic loading support to the backend
+    #ifdef GGML_BACKEND_DL
+    #ifdef __cplusplus
+    #    define GGML_BACKEND_DL_IMPL(reg_fn)                                  \
+            extern "C" {                                                      \
+                GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init();      \
+            }                                                                 \
+            ggml_backend_reg_t ggml_backend_init() {                          \
+                return reg_fn();                                              \
+            }
+    #else
+    #    define GGML_BACKEND_DL_IMPL(reg_fn)                                  \
+            GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init();          \
+            ggml_backend_reg_t ggml_backend_init() {                          \
+                return reg_fn();                                              \
+            }
+    #endif
+    #else
+    #    define GGML_BACKEND_DL_IMPL(reg_fn)
+    #endif
 
 #ifdef __cplusplus
 }
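`GGML_BACKEND_DL_IMPL` is the whole dynamic-loading contract on the backend side: it exports one C symbol, `ggml_backend_init`, returning the backend's registration structure. A sketch of what a backend source file looks like — `my_backend_reg` is an illustrative name, not part of ggml, and its implementation is assumed to live elsewhere in the backend:

```cpp
// Sketch of a dynamically loadable backend. With GGML_BACKEND_DL defined,
// the macro below expands to roughly:
//     extern "C" GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init() {
//         return my_backend_reg();
//     }
// Without it, the macro expands to nothing and the backend registers statically.
#include "ggml-backend-impl.h"

// the backend's usual registration entry point, implemented elsewhere
ggml_backend_reg_t my_backend_reg(void);

GGML_BACKEND_DL_IMPL(my_backend_reg)
```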
dlsym(handle, "ggml_backend_init"); + if (!backend_init) { + GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror()); + dlclose(handle); + return NULL; + } + ggml_backend_reg_t reg = backend_init(); + if (!reg) { + GGML_LOG_ERROR("%s: failed to initialize backend from %s\n", __func__, path); + dlclose(handle); + return NULL; + } + GGML_LOG_DEBUG("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path); + ggml_backend_register(reg); + return reg; +#endif +} + +void ggml_backend_load_all() { +#ifdef _WIN32 + #define GGML_BACKEND_PATH(backend) "ggml-" backend ".dll" +#elif defined(__APPLE__) + // path is hardcoded to the cmake build directory for now + // FIXME: should also search default system paths + #define GGML_BACKEND_PATH(backend) "build/ggml/src/ggml-" backend "/libggml-" backend ".dylib" +#else + #define GGML_BACKEND_PATH(backend) "build/ggml/src/ggml-" backend "/libggml-" backend ".so" +#endif + + ggml_backend_load(GGML_BACKEND_PATH("amx")); + ggml_backend_load(GGML_BACKEND_PATH("blas")); + ggml_backend_load(GGML_BACKEND_PATH("cann")); + ggml_backend_load(GGML_BACKEND_PATH("cuda")); + ggml_backend_load(GGML_BACKEND_PATH("hip")); + ggml_backend_load(GGML_BACKEND_PATH("kompute")); + ggml_backend_load(GGML_BACKEND_PATH("metal")); + ggml_backend_load(GGML_BACKEND_PATH("rpc")); + ggml_backend_load(GGML_BACKEND_PATH("sycl")); + ggml_backend_load(GGML_BACKEND_PATH("vulkan")); + ggml_backend_load(GGML_BACKEND_PATH("musa")); + ggml_backend_load(GGML_BACKEND_PATH("cpu")); +} diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index 648c9d875..b3f804937 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -512,3 +512,5 @@ ggml_backend_reg_t ggml_backend_blas_reg(void) { return &ggml_backend_blas_reg; } + +GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg) diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 776340881..e592560c6 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2126,3 +2126,5 @@ void ggml_backend_cann_get_device_memory(int32_t device, size_t* free, ggml_cann_set_device(device); ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total)); } + +GGML_BACKEND_DL_IMPL(ggml_backend_cann_reg) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 4b58254e7..c6ede19d9 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -13578,29 +13578,6 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int #endif // GGML_USE_OPENMP -void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) { - p->n_threads = n_threads; - p->prio = 0; // default priority (usually means normal or inherited) - p->poll = 50; // hybrid-polling enabled - p->strict_cpu = false; // no strict placement (all threads share same cpumask) - p->paused = false; // threads are ready to go - memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited) -} - -struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) { - struct ggml_threadpool_params p; - ggml_threadpool_params_init(&p, n_threads); - return p; -} - -bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) { - if (p0->n_threads != p1->n_threads ) return false; - if (p0->prio != p1->prio ) return false; - if (p0->poll != p1->poll ) return false; - if (p0->strict_cpu 
!= p1->strict_cpu ) return false; - return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; -} - static struct ggml_threadpool * ggml_threadpool_new_impl( struct ggml_threadpool_params * tpp, struct ggml_cgraph * cgraph, diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 573b7c5b9..55f88992a 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -541,16 +541,12 @@ static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg return &ggml_backend_cpu_device; } -struct ggml_backend_feature { - const char * name; - const char * value; -}; - -// Not used yet // This is intended to replace the the ggml_cpu_has_* functions when loading the CPU backend dynamically, -// and additionally to allow other backends to expose their own list of features that applications can query using the same API. +// and additionally to allow other backends to expose their own list of features that applications can query using the same API static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) { static std::vector features = []() { + ggml_cpu_init(); + std::vector features; if (ggml_cpu_has_sse3()) { features.push_back({ "SSE3", "1" }); @@ -561,6 +557,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_avx()) { features.push_back({ "AVX", "1" }); } + if (ggml_cpu_has_avx_vnni()) { + features.push_back({ "AVX_VNNI", "1" }); + } if (ggml_cpu_has_avx2()) { features.push_back({ "AVX2", "1" }); } @@ -570,9 +569,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_fma()) { features.push_back({ "FMA", "1" }); } - if (ggml_cpu_has_avx_vnni()) { - features.push_back({ "AVX_VNNI", "1" }); - } if (ggml_cpu_has_avx512()) { features.push_back({ "AVX512", "1" }); } @@ -619,6 +615,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_llamafile()) { features.push_back({ "LLAMAFILE", "1" }); } + // TODO: rename this + #ifdef GGML_USE_CPU_AARCH64 + features.push_back({ "AARCH64_REPACK", "1" }); + #endif features.push_back({ nullptr, nullptr }); @@ -637,6 +637,29 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) { return (void *)ggml_backend_cpu_get_extra_bufts; } + if (strcmp(name, "ggml_backend_get_features") == 0) { + return (void *)ggml_backend_cpu_get_features; + } + if (strcmp(name, "ggml_backend_set_abort_callback") == 0) { + return (void *)ggml_backend_cpu_set_abort_callback; + } + if (strcmp(name, "ggml_backend_cpu_numa_init") == 0) { + return (void *)ggml_numa_init; + } + if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) { + return (void *)ggml_is_numa; + } + + // threadpool - TODO: move to ggml-base + if (strcmp(name, "ggml_threadpool_new") == 0) { + return (void *)ggml_threadpool_new; + } + if (strcmp(name, "ggml_threadpool_free") == 0) { + return (void *)ggml_threadpool_free; + } + if (strcmp(name, "ggml_backend_cpu_set_threadpool") == 0) { + return (void *)ggml_backend_cpu_set_threadpool; + } return NULL; @@ -661,3 +684,5 @@ ggml_backend_reg_t ggml_backend_cpu_reg(void) { return &ggml_backend_cpu_reg; } + +GGML_BACKEND_DL_IMPL(ggml_backend_cpu_reg) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index dd94ab03d..a66b3a9f3 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3126,6 +3126,61 @@ static 
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index dd94ab03d..a66b3a9f3 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3126,6 +3126,61 @@ static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t re
     return ctx->devices[index];
 }
 
+static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t reg) {
+    static std::vector<ggml_backend_feature> features = []() {
+        std::vector<ggml_backend_feature> features;
+    #define _STRINGIFY(...) #__VA_ARGS__
+    #define STRINGIFY(...) _STRINGIFY(__VA_ARGS__)
+
+    #ifdef __CUDA_ARCH_LIST__
+        features.push_back({ "ARCHS", STRINGIFY(__CUDA_ARCH_LIST__) });
+    #endif
+
+    #ifdef GGML_CUDA_FORCE_MMQ
+        features.push_back({ "FORCE_MMQ", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_FORCE_CUBLAS
+        features.push_back({ "FORCE_CUBLAS", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_NO_VMM
+        features.push_back({ "NO_VMM", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_NO_PEER_COPY
+        features.push_back({ "NO_PEER_COPY", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_F16
+        features.push_back({ "F16", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_USE_GRAPHS
+        features.push_back({ "USE_GRAPHS", "1" });
+    #endif
+
+    #ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
+        features.push_back({ "PEER_MAX_BATCH_SIZE", STRINGIFY(GGML_CUDA_PEER_MAX_BATCH_SIZE) });
+    #endif
+
+    #ifdef GGML_CUDA_FA_ALL_QUANTS
+        features.push_back({ "FA_ALL_QUANTS", "1" });
+    #endif
+
+    #undef _STRINGIFY
+    #undef STRINGIFY
+
+        features.push_back({ nullptr, nullptr });
+
+        return features;
+    }();
+
+    return features.data();
+
+    GGML_UNUSED(reg);
+}
+
 static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
     GGML_UNUSED(reg);
     if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
@@ -3137,6 +3192,9 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con
     if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) {
         return (void *)ggml_backend_cuda_unregister_host_buffer;
     }
+    if (strcmp(name, "ggml_backend_get_features") == 0) {
+        return (void *)ggml_backend_cuda_get_features;
+    }
     return nullptr;
 }
 
@@ -3209,3 +3267,5 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
 
     return cuda_backend;
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_cuda_reg)
diff --git a/ggml/src/ggml-kompute/ggml-kompute.cpp b/ggml/src/ggml-kompute/ggml-kompute.cpp
index 2fea9e4cc..c2a7b6afa 100644
--- a/ggml/src/ggml-kompute/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute/ggml-kompute.cpp
@@ -2182,3 +2182,5 @@ ggml_backend_reg_t ggml_backend_kompute_reg() {
 
     return &reg;
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_kompute_reg)
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index d1abb3cef..1b1967f94 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -4396,3 +4396,5 @@ ggml_backend_reg_t ggml_backend_metal_reg(void) {
     return &g_ggml_backend_metal_reg;
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_metal_reg)
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index 47357daab..31c1313b9 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -1401,3 +1401,5 @@ ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) {
 
     return dev;
 }
+
+GGML_BACKEND_DL_IMPL(ggml_backend_rpc_reg)
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 255bc64c6..f36640fd6 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -4678,3 +4678,4 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
     return sycl_backend;
 }
 
+GGML_BACKEND_DL_IMPL(ggml_backend_sycl_reg)
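The CUDA feature list relies on the classic two-level stringify idiom: the `#` operator suppresses expansion of its argument, so a single-level macro would stringify the macro *name*; the extra indirection forces one expansion pass first and captures the *value*. A standalone illustration — the macro name and value here are made up:

```cpp
// Two-level stringify: STRINGIFY expands its argument first, _STRINGIFY then
// turns the expanded tokens into a string literal.
#include <cstdio>

#define _STRINGIFY(...) #__VA_ARGS__
#define STRINGIFY(...)  _STRINGIFY(__VA_ARGS__)

#define PEER_MAX_BATCH_SIZE 128 // stand-in for GGML_CUDA_PEER_MAX_BATCH_SIZE

int main() {
    printf("%s\n", _STRINGIFY(PEER_MAX_BATCH_SIZE)); // prints: PEER_MAX_BATCH_SIZE
    printf("%s\n", STRINGIFY(PEER_MAX_BATCH_SIZE));  // prints: 128
    return 0;
}
```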
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index ca71da2f7..f9e5fabfa 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -7365,3 +7365,5 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
     VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")");
 }
 #endif
+
+GGML_BACKEND_DL_IMPL(ggml_backend_vk_reg)
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 78e7874de..1a2318cb1 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -7571,3 +7571,26 @@ void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
     g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
     g_logger_state.log_callback_user_data = user_data;
 }
+
+void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
+    p->n_threads  = n_threads;
+    p->prio       = 0;     // default priority (usually means normal or inherited)
+    p->poll       = 50;    // hybrid-polling enabled
+    p->strict_cpu = false; // no strict placement (all threads share same cpumask)
+    p->paused     = false; // threads are ready to go
+    memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
+}
+
+struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
+    struct ggml_threadpool_params p;
+    ggml_threadpool_params_init(&p, n_threads);
+    return p;
+}
+
+bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
+    if (p0->n_threads  != p1->n_threads )  return false;
+    if (p0->prio       != p1->prio      )  return false;
+    if (p0->poll       != p1->poll      )  return false;
+    if (p0->strict_cpu != p1->strict_cpu)  return false;
+    return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
+}
diff --git a/pocs/CMakeLists.txt b/pocs/CMakeLists.txt
index 03e1d2c04..d49d14dee 100644
--- a/pocs/CMakeLists.txt
+++ b/pocs/CMakeLists.txt
@@ -8,5 +8,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 
 if (EMSCRIPTEN)
 else()
-    add_subdirectory(vdot)
+    if (NOT GGML_BACKEND_DL)
+        add_subdirectory(vdot)
+    endif()
 endif()
diff --git a/src/llama.cpp b/src/llama.cpp
index 001711037..88a802dfb 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4866,7 +4866,9 @@ struct llama_model_loader {
         mappings.reserve(files.size());
         mmaps_used.reserve(files.size());
         for (const auto & file : files) {
-            std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
+            auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
+            auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
+            std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn()));
             mmaps_used.emplace_back(mapping->size, 0);
             if (mlock_mmaps) {
                 std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
@@ -9190,7 +9192,7 @@ static bool llm_load_tensors(
         ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
         if (!dev) {
             // FIXME: workaround for CPU backend buft having a NULL device
-            dev = ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0);
+            dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
         }
         ggml_backend_dev_props props;
         ggml_backend_dev_get_props(dev, &props);
@@ -17443,8 +17445,9 @@ static enum ggml_status llama_graph_compute(
           int   n_threads,
           ggml_threadpool * threadpool) {
     if (lctx.backend_cpu != nullptr) {
-        ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
-        ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
+        auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu));
+        auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
+        set_threadpool_fn(lctx.backend_cpu, threadpool);
     }
 
     // set the number of threads for all the backends
@@ -19478,7 +19481,11 @@ void llama_backend_init(void) {
 
 void llama_numa_init(enum ggml_numa_strategy numa) {
     if (numa != GGML_NUMA_STRATEGY_DISABLED) {
-        ggml_numa_init(numa);
+        auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        GGML_ASSERT(dev && "CPU backend is not loaded");
+        auto * reg = ggml_backend_dev_backend_reg(dev);
+        auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
+        numa_init_fn(numa);
     }
 }
 
@@ -19752,9 +19759,6 @@ struct llama_context * llama_new_context_with_model(
                 __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }
 
-    ctx->abort_callback      = params.abort_callback;
-    ctx->abort_callback_data = params.abort_callback_data;
-
     ctx->logits_all = params.logits_all;
 
     // build worst-case graph for encoder if a model contains encoder
@@ -19803,7 +19807,7 @@ struct llama_context * llama_new_context_with_model(
         }
 
         // add CPU backend
-        ctx->backend_cpu = ggml_backend_cpu_init();
+        ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
         if (ctx->backend_cpu == nullptr) {
             LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
             llama_free(ctx);
@@ -19823,6 +19827,8 @@ struct llama_context * llama_new_context_with_model(
             }
         }
 
+        llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data);
+
         if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
             LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
@@ -19868,7 +19874,8 @@ struct llama_context * llama_new_context_with_model(
         std::vector<ggml_backend_t> backend_ptrs;
         for (auto & backend : ctx->backends) {
             auto * buft = ggml_backend_get_default_buffer_type(backend.get());
-            if (ggml_backend_is_cpu(backend.get()) && !model->devices.empty()) {
+            auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
+            if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) {
                 // use the host buffer of the first device CPU for faster transfer of the intermediate state
                 auto * dev = model->devices[0];
                 auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
@@ -19896,7 +19903,8 @@ struct llama_context * llama_new_context_with_model(
         // pipeline parallelism requires support for async compute and events in all devices
         if (pipeline_parallel) {
(auto & backend : ctx->backends) { - if (ggml_backend_is_cpu(backend.get())) { + auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { // ignore CPU backend continue; } @@ -21450,6 +21458,14 @@ int32_t llama_n_threads_batch(struct llama_context * ctx) { void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) { ctx->abort_callback = abort_callback; ctx->abort_callback_data = abort_callback_data; + + for (auto & backend : ctx->backends) { + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get())); + auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback"); + if (set_abort_callback_fn) { + set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data); + } + } } void llama_set_embeddings(struct llama_context * ctx, bool embeddings) { @@ -22191,32 +22207,23 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int } const char * llama_print_system_info(void) { - ggml_cpu_init(); // some ARM features are detected at runtime - static std::string s; - s = ""; - s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; - s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | "; - s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; - s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; - s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | "; - s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | "; - s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | "; - s += "AMX_INT8 = " + std::to_string(ggml_cpu_has_amx_int8()) + " | "; - s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; - s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; - s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | "; - s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; - s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; - s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; - s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | "; - s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; - s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; - s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | "; - s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; - s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | "; - s += "LLAMAFILE = " + std::to_string(ggml_cpu_has_llamafile()) + " | "; + for (int i = 0; i < ggml_backend_reg_count(); i++) { + auto * reg = ggml_backend_reg_get(i); + auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features"); + if (get_features_fn) { + ggml_backend_feature * features = get_features_fn(reg); + s += ggml_backend_reg_name(reg); + s += " : "; + for (; features->name; features++) { + s += features->name; + s += " = "; + s += features->value; + s += " | "; + } + } + } return s.c_str(); } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b06f122e8..82373ff4e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -110,23 +110,26 @@ llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CU # llama_target_and_test(test-double-float.cpp) # SLOW llama_target_and_test(test-log.cpp) 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index b06f122e8..82373ff4e 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -110,23 +110,26 @@ llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CU
 # llama_target_and_test(test-double-float.cpp) # SLOW
 llama_target_and_test(test-log.cpp)
 llama_target_and_test(test-arg-parser.cpp)
-llama_target_and_test(test-quantize-fns.cpp)
-llama_target_and_test(test-quantize-perf.cpp)
 llama_target_and_test(test-sampling.cpp)
 llama_target_and_test(test-chat-template.cpp)
 
 llama_target_and_test(test-grammar-parser.cpp)
 llama_target_and_test(test-grammar-integration.cpp)
 llama_target_and_test(test-llama-grammar.cpp)
-llama_target_and_test(test-barrier.cpp)
 # llama_target_and_test(test-opt.cpp) # SLOW
 llama_target_and_test(test-backend-ops.cpp)
 
-llama_target_and_test(test-rope.cpp)
-
 llama_target_and_test(test-model-load-cancel.cpp  LABEL "model")
 llama_target_and_test(test-autorelease.cpp        LABEL "model")
 
+if (NOT GGML_BACKEND_DL)
+    # these tests use the backends directly and cannot be built with dynamic loading
+    llama_target_and_test(test-barrier.cpp)
+    llama_target_and_test(test-quantize-fns.cpp)
+    llama_target_and_test(test-quantize-perf.cpp)
+    llama_target_and_test(test-rope.cpp)
+endif()
+
 # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
 if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
     llama_target_and_test(test-json-schema-to-grammar.cpp   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index b2b570524..6376b0e4c 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -16,7 +16,6 @@
 
 #include <ggml.h>
-#include <ggml-cpu.h>
 #include <ggml-alloc.h>
 #include <ggml-backend.h>
 
@@ -26,7 +25,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -639,19 +637,20 @@ struct test_case {
 
         // determine number of runs
         int n_runs;
+        bool is_cpu = ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_CPU;
         if (op_flops(out) > 0) {
             // based on flops
             const uint64_t GFLOP = 1000 * 1000 * 1000;
             const uint64_t target_flops_cpu =   8ULL * GFLOP;
             const uint64_t target_flops_gpu = 100ULL * GFLOP;
-            uint64_t target_flops = ggml_backend_is_cpu(backend) ? target_flops_cpu : target_flops_gpu;
+            uint64_t target_flops = is_cpu ? target_flops_cpu : target_flops_gpu;
             n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_flops / op_flops(out)) + 1;
         } else {
             // based on memory size
             const size_t GB = 1ULL << 30;
             const size_t target_size_cpu =  8 * GB;
             const size_t target_size_gpu = 32 * GB;
-            size_t target_size = ggml_backend_is_cpu(backend) ? target_size_cpu : target_size_gpu;
+            size_t target_size = is_cpu ? target_size_cpu : target_size_gpu;
             n_runs = std::min<int>(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
         }
 
@@ -3873,7 +3872,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
 static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) {
     if (mode == MODE_TEST) {
         auto test_cases = make_test_cases_eval();
-        ggml_backend_t backend_cpu = ggml_backend_cpu_init();
+        ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
+        if (backend_cpu == NULL) {
+            printf("  Failed to initialize CPU backend\n");
+            return false;
+        }
 
         size_t n_ok = 0;
         for (auto & test : test_cases) {
@@ -3953,7 +3956,9 @@ int main(int argc, char ** argv) {
         }
     }
 
-    // enumerate backends
+    // load and enumerate backends
+    ggml_backend_load_all();
+
     printf("Testing %zu devices\n\n", ggml_backend_dev_count());
 
     size_t n_ok = 0;
@@ -3969,16 +3974,15 @@ int main(int argc, char ** argv) {
             continue;
         }
 
-        ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
-        GGML_ASSERT(backend != NULL);
-
-        if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) {
+        if (backend_filter == NULL && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU && mode != MODE_GRAD) {
             printf("  Skipping CPU backend\n");
-            ggml_backend_free(backend);
             n_ok++;
             continue;
         }
 
+        ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
+        GGML_ASSERT(backend != NULL);
+
         ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
         auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
         if (ggml_backend_set_n_threads_fn) {