From f578b86b2123d0f92afbaa98a031df4d4464e582 Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 13 Jun 2024 03:11:35 +0200 Subject: [PATCH] move BLAS to a separate backend (#6210) * move BLAS to a separate backend * rename GGML_USE_OPENBLAS to GGML_USE_BLAS * alloc : reuse same buffer when the same buffer type if used multiple times * set number of threads automatically for openblas and blis * sched : print assignments when GGML_SCHED_DEBUG env variable is set * sched : allow ops with weights on an incompatible buffer type This will cause the weight to be copied to a backend that supports the op, which is very costly. The weight should have been stored in a buffer of a backend that can run the op, but llama.cpp cannot do this automatically at the moment. --------- Co-authored-by: Georgi Gerganov --- CMakeLists.txt | 23 +- Makefile | 27 +- examples/llama-bench/llama-bench.cpp | 1 + ggml-alloc.c | 98 ++++++-- ggml-backend-impl.h | 28 ++- ggml-backend.c | 242 +++++++++++++----- ggml-backend.h | 6 +- ggml-blas.cpp | 363 +++++++++++++++++++++++++++ ggml-blas.h | 23 ++ ggml-cuda.cu | 44 ++-- ggml-kompute.cpp | 13 +- ggml-metal.m | 15 +- ggml-rpc.cpp | 21 +- ggml-sycl.cpp | 28 +-- ggml-vulkan.cpp | 26 +- ggml.c | 205 ++------------- llama.cpp | 37 ++- 17 files changed, 821 insertions(+), 379 deletions(-) create mode 100644 ggml-blas.cpp create mode 100644 ggml-blas.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 8e280f87d..08481334f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,8 +39,12 @@ endif() if (APPLE) set(LLAMA_METAL_DEFAULT ON) + set(LLAMA_BLAS_DEFAULT ON) + set(LLAMA_BLAS_VENDOR_DEFAULT "Apple") else() set(LLAMA_METAL_DEFAULT OFF) + set(LLAMA_BLAS_DEFAULT OFF) + set(LLAMA_BLAS_VENDOR_DEFAULT "Generic") endif() set(LLAMA_LLAMAFILE_DEFAULT ON) @@ -91,9 +95,10 @@ endif() # 3rd party libs option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) -option(LLAMA_BLAS "llama: use BLAS" OFF) +option(LLAMA_BLAS "llama: use BLAS" ${LLAMA_BLAS_DEFAULT}) +set(LLAMA_BLAS_VENDOR ${LLAMA_BLAS_VENDOR_DEFAULT} CACHE STRING + "llama: BLAS library vendor") option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT}) -set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") option(LLAMA_CUDA "llama: use CUDA" OFF) option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF) option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) @@ -311,9 +316,9 @@ if (LLAMA_BLAS) if (LLAMA_STATIC) set(BLA_STATIC ON) endif() - if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22) - set(BLA_SIZEOF_INTEGER 8) - endif() + #if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22) + # set(BLA_SIZEOF_INTEGER 8) + #endif() set(BLA_VENDOR ${LLAMA_BLAS_VENDOR}) find_package(BLAS) @@ -321,7 +326,7 @@ if (LLAMA_BLAS) if (BLAS_FOUND) message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}") - if ("${BLAS_INCLUDE_DIRS}" STREQUAL "") + if (("${BLAS_INCLUDE_DIRS}" STREQUAL "") AND NOT (${LLAMA_BLAS_VENDOR} MATCHES "Apple")) # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake. # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268 find_package(PkgConfig REQUIRED) @@ -374,12 +379,15 @@ if (LLAMA_BLAS) add_compile_options(${BLAS_LINKER_FLAGS}) - add_compile_definitions(GGML_USE_OPENBLAS) + add_compile_definitions(GGML_USE_BLAS) if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel")) add_compile_definitions(GGML_BLAS_USE_MKL) endif() + set(GGML_HEADERS_BLAS ggml-blas.h) + set(GGML_SOURCES_BLAS ggml-blas.cpp) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) else() @@ -1258,6 +1266,7 @@ add_library(ggml OBJECT ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE} ${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN} ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} + ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS} ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE} ) diff --git a/Makefile b/Makefile index a4cab1bb2..744fe5739 100644 --- a/Makefile +++ b/Makefile @@ -440,10 +440,11 @@ ifndef LLAMA_NO_ACCELERATE # Mac OS - include Accelerate framework. # `-framework Accelerate` works both with Apple Silicon and Mac Intel ifeq ($(UNAME_S),Darwin) - MK_CPPFLAGS += -DGGML_USE_ACCELERATE + MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64 MK_LDFLAGS += -framework Accelerate + OBJS += ggml-blas.o endif endif # LLAMA_NO_ACCELERATE @@ -454,21 +455,30 @@ ifndef LLAMA_NO_OPENMP endif # LLAMA_NO_OPENMP ifdef LLAMA_OPENBLAS - MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas) + MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas) MK_LDFLAGS += $(shell pkg-config --libs openblas) + OBJS += ggml-blas.o endif # LLAMA_OPENBLAS +ifdef LLAMA_OPENBLAS64 + MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64) + MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64) + MK_LDFLAGS += $(shell pkg-config --libs openblas64) + OBJS += ggml-blas.o +endif # LLAMA_OPENBLAS64 + +ifdef LLAMA_BLIS + MK_CPPFLAGS += -DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis + MK_LDFLAGS += -lblis -L/usr/local/lib + OBJS += ggml-blas.o +endif # LLAMA_BLIS + ifndef LLAMA_NO_LLAMAFILE MK_CPPFLAGS += -DGGML_USE_LLAMAFILE OBJS += sgemm.o endif -ifdef LLAMA_BLIS - MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis - MK_LDFLAGS += -lblis -L/usr/local/lib -endif # LLAMA_BLIS - ifdef LLAMA_RPC MK_CPPFLAGS += -DGGML_USE_RPC OBJS += ggml-rpc.o @@ -776,6 +786,9 @@ ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h $(CC) $(CFLAGS) -c $< -o $@ +ggml-blas.o: ggml-blas.cpp ggml-blas.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + unicode.o: unicode.cpp unicode.h $(CXX) $(CXXFLAGS) -c $< -o $@ diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 61f5a5a09..61dd1d71a 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -293,6 +293,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { params.output_format = cmd_params_defaults.output_format; params.output_format_stderr = cmd_params_defaults.output_format_stderr; params.reps = cmd_params_defaults.reps; + params.numa = cmd_params_defaults.numa; for (int i = 1; i < argc; i++) { arg = argv[i]; diff --git a/ggml-alloc.c b/ggml-alloc.c index eb75962d4..bd367c42d 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -339,6 +339,7 @@ struct hash_node { }; struct tensor_alloc { + int buffer_id; size_t offset; size_t size_max; // 0 = pre-allocated, unused, or view }; @@ -349,7 +350,6 @@ struct leaf_alloc { }; struct node_alloc { - int buffer_id; struct tensor_alloc dst; struct tensor_alloc src[GGML_MAX_SRC]; }; @@ -386,8 +386,19 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs for (int i = 0; i < n_bufs; i++) { galloc->bufts[i] = bufts[i]; galloc->buffers[i] = NULL; - size_t alignment = ggml_backend_buft_get_alignment(bufts[i]); - galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment); + + // check if the same buffer type is used multiple times and reuse the same allocator + for (int j = 0; j < i; j++) { + if (bufts[i] == bufts[j]) { + galloc->buf_tallocs[i] = galloc->buf_tallocs[j]; + break; + } + } + + if (galloc->buf_tallocs[i] == NULL) { + size_t alignment = ggml_backend_buft_get_alignment(bufts[i]); + galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment); + } } galloc->n_buffers = n_bufs; @@ -405,10 +416,30 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) { for (int i = 0; i < galloc->n_buffers; i++) { if (galloc->buffers != NULL) { - ggml_backend_buffer_free(galloc->buffers[i]); + // skip if already freed + bool freed = false; + for (int j = 0; j < i; j++) { + if (galloc->buffers[j] == galloc->buffers[i]) { + freed = true; + break; + } + } + if (!freed) { + ggml_backend_buffer_free(galloc->buffers[i]); + } } if (galloc->buf_tallocs != NULL) { - ggml_dyn_tallocr_free(galloc->buf_tallocs[i]); + // skip if already freed + bool freed = false; + for (int j = 0; j < i; j++) { + if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) { + freed = true; + break; + } + } + if (!freed) { + ggml_dyn_tallocr_free(galloc->buf_tallocs[i]); + } } } @@ -511,17 +542,18 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor } } -static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) { +static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) { // graph outputs are never freed if (node->flags & GGML_TENSOR_FLAG_OUTPUT) { AT_PRINTF("not freeing output %s\n", node->name); return; } - struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; - ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); size_t offset = hn->offset; + int buffer_id = hn->buffer_id; + struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; + ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; size_t size = ggml_backend_buft_get_alloc_size(buft, node); ggml_dyn_tallocr_free_tensor(alloc, offset, size, node); hn->allocated = false; @@ -626,11 +658,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views); if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) { - ggml_gallocr_free_node(galloc, view_src, buffer_id); + ggml_gallocr_free_node(galloc, view_src); } } else if (p_hn->allocated) { - ggml_gallocr_free_node(galloc, parent, buffer_id); + ggml_gallocr_free_node(galloc, parent); } } AT_PRINTF("\n"); @@ -674,22 +706,25 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; struct node_alloc * node_alloc = &galloc->node_allocs[i]; - node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i); if (node->view_src || node->data) { + node_alloc->dst.buffer_id = -1; node_alloc->dst.offset = SIZE_MAX; node_alloc->dst.size_max = 0; } else { struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); - node_alloc->dst.offset = hn->offset; - node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node); + node_alloc->dst.buffer_id = hn->buffer_id; + node_alloc->dst.offset = hn->offset; + node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node); } for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (!src || src->view_src || src->data) { + node_alloc->src[j].buffer_id = -1; node_alloc->src[j].offset = SIZE_MAX; node_alloc->src[j].size_max = 0; } else { struct hash_node * hn = ggml_gallocr_hash_get(galloc, src); + node_alloc->src[j].buffer_id = hn->buffer_id; node_alloc->src[j].offset = hn->offset; node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src); } @@ -706,9 +741,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf); galloc->leaf_allocs[i].buffer_id = hn->buffer_id; if (leaf->view_src || leaf->data) { + galloc->leaf_allocs[i].leaf.buffer_id = -1; galloc->leaf_allocs[i].leaf.offset = SIZE_MAX; galloc->leaf_allocs[i].leaf.size_max = 0; } else { + galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id; galloc->leaf_allocs[i].leaf.offset = hn->offset; galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf); } @@ -716,6 +753,14 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c // reallocate buffers if needed for (int i = 0; i < galloc->n_buffers; i++) { + // if the buffer type is used multiple times, we reuse the same buffer + for (int j = 0; j < i; j++) { + if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) { + galloc->buffers[i] = galloc->buffers[j]; + break; + } + } + size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0; size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]); @@ -724,6 +769,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c #ifndef NDEBUG fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); #endif + ggml_backend_buffer_free(galloc->buffers[i]); galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size); if (galloc->buffers[i] == NULL) { @@ -740,7 +786,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL); } -static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) { +static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) { + int buffer_id = tensor_alloc->buffer_id; assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max); if (tensor->view_src != NULL) { @@ -768,8 +815,8 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * } } -static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) { - ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id]; +static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) { + ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL; size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node); return talloc->size_max >= node_size; } @@ -793,7 +840,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph struct ggml_tensor * node = graph->nodes[i]; struct node_alloc * node_alloc = &galloc->node_allocs[i]; - if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) { + if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) { #ifndef NDEBUG fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name); #endif @@ -805,7 +852,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph if (src == NULL) { continue; } - if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) { + if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) { #ifndef NDEBUG fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name); #endif @@ -846,7 +893,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) for (int i = 0; i < graph->n_leafs; i++) { struct ggml_tensor * leaf = graph->leafs[i]; struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i]; - ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf); + ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf); } // nodes for (int i = 0; i < graph->n_nodes; i++) { @@ -857,9 +904,9 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) if (src == NULL) { continue; } - ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]); + ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]); } - ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst); + ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst); } return true; @@ -871,6 +918,15 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) { if (galloc->buffers[buffer_id] == NULL) { return 0; } + + for (int i = 0; i < buffer_id; i++) { + if (galloc->buffers[i] == galloc->buffers[buffer_id]) { + // this buffer is the same as a previous one due to the same buffer type being used multiple times + // only return the buffer size the first time it appears to avoid double counting + return 0; + } + } + return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]); } diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index f121e1de4..36ca37086 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -17,13 +17,15 @@ extern "C" { struct ggml_backend_buffer_type_i { const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft); + // allocate a buffer of this type ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); - size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment - size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size - size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding - bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend + // tensor alignment + size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); + // max buffer size that can be allocated + size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); + // data size needed to allocate the tensor, including padding + size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // check if tensor data is in host memory - // should be equivalent to supports_backend(buft, ggml_backend_cpu_init()) bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft); }; @@ -92,27 +94,37 @@ extern "C" { void (*GGML_CALL synchronize)(ggml_backend_t backend); // compute graph with a plan (not used currently) + // create a new plan for a graph ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology + void (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph); + // compute the graph with the plan + enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); - // compute graph with a plan - enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); // compute graph without a plan (async) enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); - // check if the backend supports an operation + // check if the backend can compute an operation bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); + // check if the backend can use tensors allocated in a buffer type + bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft); + // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer // these should be expensive operations with large batch sizes that may benefit from running on this backend // even if the weight has to be copied from the CPU temporarily bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op); // (optional) event synchronization + // create a new event that can record events on this backend instance ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend); void (*GGML_CALL event_free) (ggml_backend_event_t event); + // record an event on the backend instance that created it void (*GGML_CALL event_record) (ggml_backend_event_t event); + // wait for an event on on a different backend instance void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event); + // block until an event is recorded void (*GGML_CALL event_synchronize) (ggml_backend_event_t event); }; diff --git a/ggml-backend.c b/ggml-backend.c index 05737ed69..2bec7bea3 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -44,10 +44,6 @@ GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buf return ggml_nbytes(tensor); } -bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return buft->iface.supports_backend(buft, backend); -} - bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) { if (buft->iface.is_host) { return buft->iface.is_host(buft); @@ -286,6 +282,10 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * return backend->iface.supports_op(backend, op); } +bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + return backend->iface.supports_buft(backend, buft); +} + bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) { if (backend->iface.offload_op != NULL) { return backend->iface.offload_op(backend, op); @@ -639,12 +639,6 @@ GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_ GGML_UNUSED(buft); } -GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_cpu(backend); - - GGML_UNUSED(buft); -} - GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return true; @@ -659,7 +653,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend, /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, }, /* .context = */ NULL, @@ -715,7 +708,6 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend, /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, }, /* .context = */ NULL, @@ -836,6 +828,12 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const GGML_UNUSED(backend); } +GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + return ggml_backend_buft_is_host(buft); + + GGML_UNUSED(backend); +} + static struct ggml_backend_i cpu_backend_i = { /* .get_name = */ ggml_backend_cpu_name, /* .free = */ ggml_backend_cpu_free, @@ -846,9 +844,11 @@ static struct ggml_backend_i cpu_backend_i = { /* .synchronize = */ NULL, /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create, /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute, /* .graph_compute = */ ggml_backend_cpu_graph_compute, /* .supports_op = */ ggml_backend_cpu_supports_op, + /* .supports_buft = */ ggml_backend_cpu_supports_buft, /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, @@ -1055,6 +1055,9 @@ struct ggml_backend_sched { int * node_backend_ids; // [graph_size] int * leaf_backend_ids; // [graph_size] + int * prev_node_backend_ids; // [graph_size] + int * prev_leaf_backend_ids; // [graph_size] + // copy of the graph with modified inputs struct ggml_cgraph * graph; @@ -1075,6 +1078,8 @@ struct ggml_backend_sched { ggml_backend_sched_eval_callback callback_eval; void * callback_eval_user_data; + bool debug; + // align context_buffer to GGML_MEM_ALIGN #ifdef _MSC_VER __declspec(align(GGML_MEM_ALIGN)) @@ -1097,22 +1102,24 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen return -1; } -static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) { +static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) { ggml_backend_buffer_t buffer = tensor->buffer; if (buffer == NULL) { return -1; } - // find highest prio backend that supports the buffer type + // find highest prio backend that supports the buffer type and the op for (int i = 0; i < sched->n_backends; i++) { - if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) { + if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) && + ggml_backend_supports_op(sched->backends[i], op)) { return i; } } - fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n", - __func__, ggml_backend_buffer_name(buffer), tensor->name); - GGML_ASSERT(false); +#ifndef NDEBUG + fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n", + __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name); +#endif return -1; } @@ -1131,7 +1138,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st // TODO: use supports_op to check if the backend supports the op // assign pre-allocated nodes to their backend - int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor); + int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor); if (cur_backend_id != -1) { SET_CAUSE(tensor, "1.dst"); return cur_backend_id; @@ -1139,7 +1146,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st // view_src if (tensor->view_src != NULL) { - cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src); + cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor); if (cur_backend_id != -1) { SET_CAUSE(tensor, "1.vsrc"); return cur_backend_id; @@ -1161,7 +1168,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st continue; } if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { - int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src); + int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor); // check if a backend with higher prio wants to offload the op if (src_backend_id == sched->n_backends - 1) { for (int b = 0; b < src_backend_id; b++) { @@ -1223,10 +1230,33 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str } } -//#define DEBUG_PASS1 -//#define DEBUG_PASS2 -//#define DEBUG_PASS3 -//#define DEBUG_PASS4 +static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) { + ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer; + ggml_backend_buffer_type_t buft = NULL; + + if (buf) { + // the tensor is already allocated + buft = buf->buft; + } else { + // see if the tensor already has a backend assigned, and use the buffer type of that backend + int tensor_backend_id = tensor_backend_id(t); + if (tensor_backend_id == -1 && t->view_src) { + tensor_backend_id = tensor_backend_id(t->view_src); + } + if (tensor_backend_id != -1) { + buft = sched->bufts[tensor_backend_id]; + } + } + + return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft); +} + +static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) { + if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) { + *node_backend_id = cur_backend_id; + SET_CAUSE(node, "2.sup"); + } +} // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { @@ -1280,17 +1310,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } } -#ifdef DEBUG_PASS1 - fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph); -#endif // pass 2: expand current backend assignments // assign the same backend to adjacent nodes // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend) // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops - - - // pass 2.2 expand gpu down + // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later when the locations of its inputs are known + // expand gpu down { int cur_backend_id = -1; for (int i = 0; i < graph->n_nodes; i++) { @@ -1306,13 +1332,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } else { cur_backend_id = *node_backend_id; } - } else { - *node_backend_id = cur_backend_id; - SET_CAUSE(node, "2.2"); + } else if (cur_backend_id != -1) { + ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } } - // pass 2.1 expand gpu up + // expand gpu up { int cur_backend_id = -1; for (int i = graph->n_nodes - 1; i >= 0; i--) { @@ -1328,13 +1353,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } else { cur_backend_id = *node_backend_id; } - } else { - *node_backend_id = cur_backend_id; - SET_CAUSE(node, "2.1"); + } else if (cur_backend_id != -1) { + ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } } - // pass 2.4 expand rest down + // expand rest down { int cur_backend_id = -1; for (int i = 0; i < graph->n_nodes; i++) { @@ -1345,13 +1369,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg int * node_backend_id = &tensor_backend_id(node); if (*node_backend_id != -1) { cur_backend_id = *node_backend_id; - } else { - *node_backend_id = cur_backend_id; - SET_CAUSE(node, "2.4"); + } else if (cur_backend_id != -1) { + ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } } - // pass 2.3 expand rest up + // expand rest up { int cur_backend_id = -1; for (int i = graph->n_nodes - 1; i >= 0; i--) { @@ -1362,24 +1385,80 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg int * node_backend_id = &tensor_backend_id(node); if (*node_backend_id != -1) { cur_backend_id = *node_backend_id; - } else { - *node_backend_id = cur_backend_id; - SET_CAUSE(node, "2.3"); + } else if (cur_backend_id != -1) { + ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id); } } } -#ifdef DEBUG_PASS2 - fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph); -#endif + // pass 3: upgrade nodes to higher prio backends with compatible buffer types + // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there + // however, we also need to verify that the sources are in compatible buffer types + // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph + // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same + // this is not uncommon since multiple backends can use host memory, with the same buffer type (eg. BLAS and CPU) + // additionally, set remaining unassigned nodes to the backend with the most supported inputs + // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + if (ggml_is_view_op(node->op)) { + continue; + } + int * node_backend_id = &tensor_backend_id(node); + if (*node_backend_id == -1) { + // unassigned node: find the backend with the most supported inputs + int n_supported_best = -1; + for (int b = 0; b < sched->n_backends; b++) { + if (ggml_backend_supports_op(sched->backends[b], node)) { + int n_supported = 0; + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + continue; + } + if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) { + n_supported++; + } + } + if (n_supported > n_supported_best) { + n_supported_best = n_supported; + *node_backend_id = b; + SET_CAUSE(node, "3.best"); + } + } + } + } else { + // assigned node: upgrade to higher prio backend if possible + for (int b = 0; b < *node_backend_id; b++) { + if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) { + bool supported = true; + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + continue; + } + if (!ggml_backend_sched_buffer_supported(sched, src, b)) { + supported = false; + break; + } + } + if (supported) { + *node_backend_id = b; + SET_CAUSE(node, "3.upg"); + break; + } + } + } + } + } - // pass 3: assign backends to remaining src from dst and view_src + // pass 4: assign backends to remaining src from dst and view_src for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; int * cur_backend_id = &tensor_backend_id(node); if (node->view_src != NULL && *cur_backend_id == -1) { *cur_backend_id = tensor_backend_id(node->view_src); - SET_CAUSE(node, "3.vsrc"); + SET_CAUSE(node, "4.vsrc"); } for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; @@ -1391,17 +1470,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (src->view_src != NULL) { // views are always on the same backend as the source *src_backend_id = tensor_backend_id(src->view_src); - SET_CAUSE(src, "3.vsrc"); + SET_CAUSE(src, "4.vsrc"); } else { *src_backend_id = *cur_backend_id; - SET_CAUSE(src, "3.cur"); + SET_CAUSE(src, "4.cur"); } } } } -#ifdef DEBUG_PASS3 - fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph); -#endif // pass 4: split graph, find tensors that need to be copied { @@ -1448,10 +1524,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } // check if the split has too many inputs + // FIXME: count the number of inputs instead of only checking when full if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) { const size_t id = hash_id(src); int src_backend_id = sched->tensor_backend_id[id]; - if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) { + bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id); + if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) { //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name); need_new_split = true; break; @@ -1486,7 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg const int src_backend_id = tensor_backend_id(src); assert(src_backend_id != -1); // all inputs should be assigned by now - if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) { + if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) { size_t id = hash_id(src); if (sched->tensor_copies[id][src_backend_id][0] == NULL) { ggml_backend_t backend = sched->backends[src_backend_id]; @@ -1511,7 +1589,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } - if (src_backend_id != node_backend_id) { + bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id); + if (src_backend_id != cur_backend_id && !supported) { // create a copy of the input in the split's backend const size_t id = hash_id(src); if (sched->tensor_copies[id][cur_backend_id][0] == NULL) { @@ -1537,9 +1616,21 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg split->i_end = graph->n_nodes; sched->n_splits = i_split + 1; } -#ifdef DEBUG_PASS4 - fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph); -#endif + + if (sched->debug) { + ggml_backend_sched_print_assignments(sched, graph); + } + + // swap node_backend_ids and leaf_backend_ids and prevs + { + int * tmp = sched->node_backend_ids; + sched->node_backend_ids = sched->prev_node_backend_ids; + sched->prev_node_backend_ids = tmp; + + tmp = sched->leaf_backend_ids; + sched->leaf_backend_ids = sched->prev_leaf_backend_ids; + sched->prev_leaf_backend_ids = tmp; + } // create copies of the graph for each split // TODO: avoid this copy @@ -1613,8 +1704,24 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { + bool backend_ids_changed = false; + for (int i = 0; i < sched->graph->n_nodes; i++) { + if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i]) { + backend_ids_changed = true; + break; + } + } + if (!backend_ids_changed) { + for (int i = 0; i < sched->graph->n_leafs; i++) { + if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i]) { + backend_ids_changed = true; + break; + } + } + } + // allocate graph - if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { + if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { // the re-allocation may cause the split inputs to be moved to a different address ggml_backend_sched_synchronize(sched); #ifndef NDEBUG @@ -1727,6 +1834,8 @@ ggml_backend_sched_t ggml_backend_sched_new( struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched)); + sched->debug = getenv("GGML_SCHED_DEBUG") != NULL; + // initialize hash table sched->hash_set = ggml_hash_set_new(graph_size); sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0])); @@ -1735,6 +1844,8 @@ ggml_backend_sched_t ggml_backend_sched_new( const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2; sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0])); sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0])); + sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0])); + sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0])); sched->n_backends = n_backends; @@ -1747,7 +1858,7 @@ ggml_backend_sched_t ggml_backend_sched_new( for (int b = 0; b < n_backends; b++) { sched->backends[b] = backends[b]; sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]); - GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b])); + GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b])); if (sched->n_copies > 1) { for (int c = 0; c < sched->n_copies; c++) { sched->events[b][c] = ggml_backend_event_new(backends[b]); @@ -1779,6 +1890,8 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { free(sched->tensor_copies); free(sched->node_backend_ids); free(sched->leaf_backend_ids); + free(sched->prev_node_backend_ids); + free(sched->prev_leaf_backend_ids); free(sched); } @@ -1875,6 +1988,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg int backend_index = ggml_backend_sched_backend_id(sched, backend); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); tensor_backend_id(node) = backend_index; + SET_CAUSE(node, "usr"); } ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) { diff --git a/ggml-backend.h b/ggml-backend.h index c582b0685..47fd81475 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -23,7 +23,6 @@ extern "C" { GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); - GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend); GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); // buffer @@ -74,6 +73,7 @@ extern "C" { GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); + GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft); GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op); // tensor copy between different backends @@ -90,7 +90,7 @@ extern "C" { GGML_API void ggml_backend_event_free (ggml_backend_event_t event); GGML_API void ggml_backend_event_record (ggml_backend_event_t event); GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event); - GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event + GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // // CPU backend @@ -119,7 +119,7 @@ extern "C" { GGML_API size_t ggml_backend_reg_get_count(void); GGML_API size_t ggml_backend_reg_find_by_name(const char * name); - GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params] + GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional) GGML_API const char * ggml_backend_reg_get_name(size_t i); GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i); diff --git a/ggml-blas.cpp b/ggml-blas.cpp new file mode 100644 index 000000000..d709a357b --- /dev/null +++ b/ggml-blas.cpp @@ -0,0 +1,363 @@ +#include "ggml-blas.h" +#include "ggml-backend-impl.h" + +#include +#include + +#if defined(GGML_USE_ACCELERATE) +# include +#elif defined(GGML_BLAS_USE_MKL) +# include +#else +# include +# ifdef BLIS_ENABLE_CBLAS +# include +# endif +#endif + +struct ggml_backend_blas_context { + int n_threads = GGML_DEFAULT_N_THREADS; + std::unique_ptr work_data; + size_t work_size = 0; +#ifndef GGML_USE_OPENMP + std::vector> tasks; +#endif +}; + +// helper function to determine if it is better to use BLAS or not +// for large matrices, BLAS is faster +static bool ggml_backend_blas_use_blas(const struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + // TODO: find the optimal values for these + if (ggml_is_contiguous(src0) && + ggml_is_contiguous(src1) && + src1->type == GGML_TYPE_F32 && + (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { + + /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ + return true; + } + + return false; +} + +static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + const enum ggml_type type = src0->type; + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + const int64_t ne_plane = ne01*ne00; + const size_t desired_wsize = type == GGML_TYPE_F32 ? 0 : ne03*ne02*ne_plane*sizeof(float); + + if (ctx->work_size < desired_wsize) { + ctx->work_data.reset(new char[desired_wsize]); + ctx->work_size = desired_wsize; + } + void * wdata = ctx->work_data.get(); + + // convert src0 to float + if (type != GGML_TYPE_F32) { + ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type); + ggml_to_float_t const to_float = type_traits.to_float; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const void * x = (char *) src0->data + i02*nb02 + i03*nb03; + float * const wplane = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; + + const int min_cols_per_thread = 4096; + const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1); + const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1); + +#ifdef GGML_USE_OPENMP + #pragma omp parallel for num_threads(n_threads) + for (int64_t i01 = 0; i01 < ne01; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + } +#else + for (int i = 1; i < n_threads; i++) { + const int64_t start = i*ne01/n_threads; + const int64_t end = (i + 1)*ne01/n_threads; + if (start < end) { + ctx->tasks.push_back(std::async(std::launch::async, [=]() { + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + } + })); + } + } + { + // reuse the current thread for the first task + const int64_t start = 0; + const int64_t end = ne01/n_threads; + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00); + } + } +#endif + } + } + +#ifndef GGML_USE_OPENMP + // wait for all tasks to finish + for (auto & task : ctx->tasks) { + task.get(); + } + ctx->tasks.clear(); +#endif + } + +#if defined(OPENBLAS_VERSION) + openblas_set_num_threads(ctx->n_threads); +#endif + +#if defined(BLIS_ENABLE_CBLAS) + bli_thread_set_num_threads(ctx->n_threads); +#endif + + for (int64_t i13 = 0; i13 < ne13; i13++) { + for (int64_t i12 = 0; i12 < ne12; i12++) { + const int64_t i03 = i13/r3; + const int64_t i02 = i12/r2; + + const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); + const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + + if (type != GGML_TYPE_F32) { + x = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane; + } + + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ne1, ne01, ne10, + 1.0f, y, ne10, + x, ne00, + 0.0f, d, ne01); + } + } +} + +static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) { + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT(ne0 == ne00); + GGML_ASSERT(ne1 == ne10); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne3 == ne13); + GGML_ASSERT(ne03 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == sizeof(float)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + // GGML_ASSERT(nb0 <= nb1); + // GGML_ASSERT(nb1 <= nb2); + // GGML_ASSERT(nb2 <= nb3); + + // Arguments to ggml_compute_forward_out_prod (expressed as major,minor) + // src0: (k,n) + // src1: (k,m) + // dst: (m,n) + // + // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f) + // Also expressed as (major,minor) + // a: (m,k): so src1 transposed + // b: (k,n): so src0 + // c: (m,n) + // + // However, if ggml_is_transposed(src1) is true, then + // src1->data already contains a transposed version, so sgemm mustn't + // transpose it further. + + int n = src0->ne[0]; + int k = src0->ne[1]; + int m = src1->ne[0]; + + CBLAS_TRANSPOSE transposeA; + int lda; + + if (!ggml_is_transposed(src1)) { + transposeA = CblasTrans; + lda = m; + } else { + transposeA = CblasNoTrans; + lda = k; + } + + float * a = (float *) ((char *) src1->data); + float * b = (float *) ((char *) src0->data); + float * c = (float *) ((char *) dst->data); + + cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n); + + GGML_UNUSED(ctx); +} + +// backend interface + +GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) { + return "BLAS"; + + GGML_UNUSED(backend); +} + +GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) { + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; + delete ctx; + delete backend; +} + +GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) { + return ggml_backend_cpu_buffer_type(); + + GGML_UNUSED(backend); +} + +GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context; + + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * node = cgraph->nodes[i]; + + switch (node->op) { + case GGML_OP_MUL_MAT: + ggml_backend_blas_mul_mat(ctx, node); + break; + + case GGML_OP_OUT_PROD: + ggml_backend_blas_out_prod(ctx, node); + break; + + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + break; + + default: + fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node)); + GGML_ASSERT(false); + } + } + + return GGML_STATUS_SUCCESS; + + GGML_UNUSED(backend); +} + +GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + + return (op->op == GGML_OP_MUL_MAT && ggml_backend_blas_use_blas(op)) || + (op->op == GGML_OP_OUT_PROD && op->src[0]->type == GGML_TYPE_F32 && + op->src[1]->type == GGML_TYPE_F32 && + ggml_is_matrix(src0) && + ggml_is_matrix(src1) && + ggml_is_contiguous(src0) && + (ggml_is_contiguous(src1) || ggml_is_transposed(src1))); + + GGML_UNUSED(backend); +} + +GGML_CALL static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + return ggml_backend_buft_is_host(buft); + + GGML_UNUSED(backend); +} + +static struct ggml_backend_i blas_backend_i = { + /* .get_name = */ ggml_backend_blas_name, + /* .free = */ ggml_backend_blas_free, + /* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_blas_graph_compute, + /* .supports_op = */ ggml_backend_blas_supports_op, + /* .supports_buft = */ ggml_backend_blas_supports_buft, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, + /* .event_synchronize = */ NULL, +}; + +static ggml_guid_t ggml_backend_blas_guid(void) { + static ggml_guid guid = { 0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d }; + return &guid; +} + +ggml_backend_t ggml_backend_blas_init(void) { + ggml_backend_blas_context * ctx = new ggml_backend_blas_context; + + ggml_backend_t backend = new ggml_backend { + /* .guid = */ ggml_backend_blas_guid(), + /* .interface = */ blas_backend_i, + /* .context = */ ctx, + }; + +#if !defined(NDEBUG) && defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP) + if (openblas_get_parallel() != OPENBLAS_OPENMP) { + fprintf(stderr, "%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__); + } +#endif + +#if !defined(NDEBUG) && defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP) + fprintf(stderr, "%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__); +#endif + + return backend; +} + +GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) { + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid()); +} + +void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads) { + GGML_ASSERT(ggml_backend_is_blas(backend_blas)); + + ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context; + ctx->n_threads = n_threads; +} diff --git a/ggml-blas.h b/ggml-blas.h new file mode 100644 index 000000000..f2e37de06 --- /dev/null +++ b/ggml-blas.h @@ -0,0 +1,23 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +// backend API +GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void); + +GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend); + +// number of threads used for conversion to float +// for openblas and blis, this will also set the number of threads used for blas operations +GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); + + +#ifdef __cplusplus +} +#endif diff --git a/ggml-cuda.cu b/ggml-cuda.cu index c6bc3f64c..64d3b6747 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -543,6 +543,10 @@ GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_bu return ctx->name.c_str(); } +static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_cuda_buffer_type_name; +} + GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; @@ -585,24 +589,12 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backen GGML_UNUSED(buft); } -GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - if (!ggml_backend_is_cuda(backend)) { - return false; - } - - ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; - - return buft_ctx->device == cuda_ctx->device; -} - static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { /* .get_name = */ ggml_backend_cuda_buffer_type_name, /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend, /* .is_host = */ NULL, }; @@ -863,6 +855,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_back GGML_UNUSED(buft); } +static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name; +} + GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point // instead, we allocate them for each tensor separately in init_tensor @@ -906,12 +902,6 @@ GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_ return total_size; } -GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_cuda(backend); - - GGML_UNUSED(buft); -} - GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return false; @@ -924,7 +914,6 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend, /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host, }; @@ -1024,7 +1013,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, /* .context = */ nullptr, @@ -2879,6 +2867,20 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons GGML_UNUSED(backend); } +GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + if (ggml_backend_buft_is_cuda_split(buft)) { + return true; + } + + if (ggml_backend_buft_is_cuda(buft)) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; + return buft_ctx->device == cuda_ctx->device; + } + + return false; +} + GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) { const int min_batch_size = 32; @@ -2951,9 +2953,11 @@ static ggml_backend_i ggml_backend_cuda_interface = { /* .synchronize = */ ggml_backend_cuda_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_cuda_graph_compute, /* .supports_op = */ ggml_backend_cuda_supports_op, + /* .supports_buft = */ ggml_backend_cuda_supports_buft, /* .offload_op = */ ggml_backend_cuda_offload_op, /* .event_new = */ ggml_backend_cuda_event_new, /* .event_free = */ ggml_backend_cuda_event_free, diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index 18c6f4a10..ed5f2e349 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -1902,18 +1902,12 @@ static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_ return ctx->max_alloc; } -static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - GGML_UNUSED(buft); - return ggml_backend_is_kompute(backend); -} - static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = { /* .get_name = */ ggml_backend_kompute_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend, /* .is_host = */ NULL, }; @@ -1973,6 +1967,11 @@ static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struc return ggml_vk_supports_op(op); } +static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + GGML_UNUSED(backend); + return buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name; +} + static struct ggml_backend_i kompute_backend_i = { /* .get_name = */ ggml_backend_kompute_name, /* .free = */ ggml_backend_kompute_free, @@ -1983,9 +1982,11 @@ static struct ggml_backend_i kompute_backend_i = { /* .synchronize = */ NULL, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_kompute_graph_compute, /* .supports_op = */ ggml_backend_kompute_supports_op, + /* .supports_buft = */ ggml_backend_kompute_supports_buft, /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, diff --git a/ggml-metal.m b/ggml-metal.m index b5c287347..ec9e95302 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -3044,12 +3044,6 @@ GGML_CALL static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend UNUSED(buft); } -GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend); - - UNUSED(buft); -} - GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return true; @@ -3064,7 +3058,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend, /* .is_host = */ ggml_backend_metal_buffer_type_is_host, }, /* .context = */ NULL, @@ -3179,6 +3172,12 @@ GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, con return ggml_metal_supports_op(metal_ctx, op); } +GGML_CALL static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name; + + UNUSED(backend); +} + static struct ggml_backend_i ggml_backend_metal_i = { /* .get_name = */ ggml_backend_metal_name, /* .free = */ ggml_backend_metal_free, @@ -3189,9 +3188,11 @@ static struct ggml_backend_i ggml_backend_metal_i = { /* .synchronize = */ NULL, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_metal_graph_compute, /* .supports_op = */ ggml_backend_metal_supports_op, + /* .supports_buft = */ ggml_backend_metal_supports_buft, /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, diff --git a/ggml-rpc.cpp b/ggml-rpc.cpp index 679ce4f28..9b95193d3 100644 --- a/ggml-rpc.cpp +++ b/ggml-rpc.cpp @@ -540,22 +540,12 @@ GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend return ggml_nbytes(tensor); } -GGML_CALL static bool ggml_backend_rpc_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - if (!ggml_backend_is_rpc(backend)) { - return false; - } - ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; - ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; - return buft_ctx->endpoint == rpc_ctx->endpoint; -} - static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = { /* .get_name = */ ggml_backend_rpc_buffer_type_name, /* .alloc_buffer = */ ggml_backend_rpc_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_rpc_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_rpc_get_max_size, /* .get_alloc_size = */ ggml_backend_rpc_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_rpc_buffer_type_supports_backend, /* .is_host = */ NULL, }; @@ -638,6 +628,15 @@ GGML_CALL static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const return false; } +GGML_CALL static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + if (buft->iface.get_name == ggml_backend_rpc_buffer_type_name) { + return false; + } + ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context; + ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context; + return buft_ctx->endpoint == rpc_ctx->endpoint; +} + static ggml_backend_i ggml_backend_rpc_interface = { /* .get_name = */ ggml_backend_rpc_name, /* .free = */ ggml_backend_rpc_free, @@ -648,9 +647,11 @@ static ggml_backend_i ggml_backend_rpc_interface = { /* .synchronize = */ ggml_backend_rpc_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_rpc_graph_compute, /* .supports_op = */ ggml_backend_rpc_supports_op, + /* .supports_buft = */ ggml_backend_rpc_supports_buft, /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index e7d260bd4..6f41ed272 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -16575,22 +16575,12 @@ GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backen UNUSED(buft); } -GGML_CALL static bool ggml_backend_sycl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - if (!ggml_backend_is_sycl(backend)) { - return false; - } - ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; - ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; - return buft_ctx->device == sycl_ctx->device; -} - static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = { /* .get_name = */ ggml_backend_sycl_buffer_type_name, /* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_sycl_buffer_type_get_max_size, /* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_sycl_buffer_type_supports_backend, /* .is_host = */ nullptr, }; @@ -16942,12 +16932,6 @@ GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_ return total_size; } -GGML_CALL static bool ggml_backend_sycl_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_sycl(backend); - - UNUSED(buft); -} - GGML_CALL static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { return false; @@ -16960,7 +16944,6 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface /* .get_alignment = */ ggml_backend_sycl_split_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_sycl_split_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_sycl_split_buffer_type_supports_backend, /* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host, }; @@ -17046,7 +17029,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() { /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, /* .get_max_size = */ NULL, // TODO: return device.maxBufferLength /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, /* .context = */ nullptr, @@ -17311,6 +17293,14 @@ GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const GGML_UNUSED(backend); } +GGML_CALL static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + if (buft->iface.get_name != ggml_backend_sycl_buffer_type_name) { + return false; + } + ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; + ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; + return buft_ctx->device == sycl_ctx->device; +} static ggml_backend_i ggml_backend_sycl_interface = { /* .get_name = */ ggml_backend_sycl_name, @@ -17322,9 +17312,11 @@ static ggml_backend_i ggml_backend_sycl_interface = { /* .synchronize = */ ggml_backend_sycl_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_sycl_graph_compute, /* .supports_op = */ ggml_backend_sycl_supports_op, + /* .supports_buft = */ ggml_backend_sycl_supports_buft, /* .offload_op = */ ggml_backend_sycl_offload_op, /* .event_new = */ NULL, /* .event_free = */ NULL, diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 5b9280491..e2d17a352 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -6142,24 +6142,12 @@ GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_ UNUSED(buft); } -GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - if (!ggml_backend_is_vk(backend)) { - return false; - } - - ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context; - ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; - - return buft_ctx->ctx->idx == ctx->idx; -} - static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = { /* .get_name = */ ggml_backend_vk_buffer_type_name, /* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size, /* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_vk_buffer_type_supports_backend, /* .is_host = */ NULL, }; @@ -6235,7 +6223,6 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() { /* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment, /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, }, /* .context = */ nullptr, @@ -6551,6 +6538,17 @@ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const g UNUSED(backend); } +GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) { + if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) { + return false; + } + + ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context; + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; + + return buft_ctx->ctx->idx == ctx->idx; +} + // TODO: enable async and synchronize static ggml_backend_i ggml_backend_vk_interface = { /* .get_name = */ ggml_backend_vk_name, @@ -6562,9 +6560,11 @@ static ggml_backend_i ggml_backend_vk_interface = { /* .synchronize = */ NULL, // ggml_backend_vk_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_vk_graph_compute, /* .supports_op = */ ggml_backend_vk_supports_op, + /* .supports_buft = */ ggml_backend_vk_supports_buft, /* .offload_op = */ ggml_backend_vk_offload_op, /* .event_new = */ NULL, /* .event_free = */ NULL, diff --git a/ggml.c b/ggml.c index 2ea1d7677..d5d33c2ba 100644 --- a/ggml.c +++ b/ggml.c @@ -297,12 +297,6 @@ inline static void * ggml_calloc(size_t num, size_t size) { #if defined(GGML_USE_ACCELERATE) #include -#elif defined(GGML_USE_OPENBLAS) -#if defined(GGML_BLAS_USE_MKL) -#include -#else -#include -#endif #endif // floating point type used to accumulate sums @@ -12179,39 +12173,6 @@ static void ggml_compute_forward_group_norm( // ggml_compute_forward_mul_mat -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) -// helper function to determine if it is better to use BLAS or not -// for large matrices, BLAS is faster -static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) { - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - //const int64_t ne00 = src0->ne[0]; - //const int64_t ne01 = src0->ne[1]; - - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - - // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float) - // all the experts for each batch element and the processing would become incredibly slow - // TODO: find the optimal values for these - if (dst->op != GGML_OP_MUL_MAT_ID && - ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && - //src0->type == GGML_TYPE_F32 && - src1->type == GGML_TYPE_F32 && - (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { - - /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ - return true; - } - - return false; -} -#endif - static void ggml_compute_forward_mul_mat_one_chunk( const struct ggml_compute_params * params, struct ggml_tensor * dst, @@ -12349,73 +12310,6 @@ static void ggml_compute_forward_mul_mat( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(dst)) { - const int64_t ne_plane = ne01*ne00; - const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float); - UNUSED(desired_wsize); - - if (params->type == GGML_TASK_TYPE_INIT) { - if (type != GGML_TYPE_F32) { - assert(params->wsize >= desired_wsize); - // parallelize by src0 rows - for (int64_t i13 = 0; i13 < ne13; i13++) { - for (int64_t i12 = 0; i12 < ne12; i12++) { - // broadcast src0 into src1 across 2nd,3rd dimension - const int64_t i03 = i13/r3; - const int64_t i02 = i12/r2; - - const void * x = (char *) src0->data + i02*nb02 + i03*nb03; - float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane; - ggml_to_float_t const to_float = type_traits[type].to_float; - - for (int64_t i01 = ith; i01 < ne01; i01 += nth) { - to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00); - } - } - } - } - return; - } - - if (params->type == GGML_TASK_TYPE_FINALIZE) { - return; - } - - // perform sgemm, parallelization controlled by blas lib - if (ith != 0) { - return; - } - - //const int64_t tgemm0 = ggml_perf_time_us(); - for (int64_t i13 = 0; i13 < ne13; i13++) { - for (int64_t i12 = 0; i12 < ne12; i12++) { - const int64_t i03 = i13/r3; - const int64_t i02 = i12/r2; - - const void * x = (char *) src0->data + i02*nb02 + i03*nb03; - const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13); - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - - if (type != GGML_TYPE_F32) { - x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane; - } - - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne1, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } - //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2); - - //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); - - return; - } -#endif - #if GGML_USE_LLAMAFILE const bool src1_cont = ggml_is_contiguous(src1); @@ -12796,19 +12690,7 @@ static void ggml_compute_forward_out_prod_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - bool use_blas = ggml_is_matrix(src0) && - ggml_is_matrix(src1) && - ggml_is_contiguous(src0) && - (ggml_is_contiguous(src1) || ggml_is_transposed(src1)); -#endif - if (params->type == GGML_TASK_TYPE_INIT) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst - if (use_blas) { - return; - } -#endif if (ith != 0) { return; } @@ -12820,50 +12702,6 @@ static void ggml_compute_forward_out_prod_f32( return; } -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (use_blas) { - if (params->ith != 0) { // All threads other than the first do no work. - return; - } - // Arguments to ggml_compute_forward_out_prod (expressed as major,minor) - // src0: (k,n) - // src1: (k,m) - // dst: (m,n) - // - // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f) - // Also expressed as (major,minor) - // a: (m,k): so src1 transposed - // b: (k,n): so src0 - // c: (m,n) - // - // However, if ggml_is_transposed(src1) is true, then - // src1->data already contains a transposed version, so sgemm mustn't - // transpose it further. - - int n = src0->ne[0]; - int k = src0->ne[1]; - int m = src1->ne[0]; - - int transposeA, lda; - - if (!ggml_is_transposed(src1)) { - transposeA = CblasTrans; - lda = m; - } else { - transposeA = CblasNoTrans; - lda = k; - } - - float * a = (float *) ((char *) src1->data); - float * b = (float *) ((char *) src0->data); - float * c = (float *) ((char *) dst->data); - - cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n); - - return; - } -#endif - // dst[:,:,:,:] = 0 // for i2,i3: // for i1: @@ -12993,8 +12831,6 @@ static void ggml_compute_forward_out_prod_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { return; @@ -13391,6 +13227,8 @@ static void ggml_compute_forward_get_rows_q( const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + assert(i01 >= 0 && i01 < ne01); + dequantize_row_q( (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); @@ -13434,6 +13272,8 @@ static void ggml_compute_forward_get_rows_f16( const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + assert(i01 >= 0 && i01 < ne01); + ggml_fp16_to_fp32_row( (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); @@ -13477,7 +13317,9 @@ static void ggml_compute_forward_get_rows_bf16( const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); - ggml_bf16_to_fp32_row( + assert(i01 >= 0 && i01 < ne01); + + ggml_bf16_to_fp32_row( (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); } @@ -13520,6 +13362,8 @@ static void ggml_compute_forward_get_rows_f32( const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + assert(i01 >= 0 && i01 < ne01); + ggml_vec_cpy_f32(nc, (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03)); @@ -18893,6 +18737,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_ switch (node->op) { case GGML_OP_CPY: case GGML_OP_DUP: + case GGML_OP_CONT: case GGML_OP_ADD: case GGML_OP_ADD1: case GGML_OP_ACC: @@ -18977,7 +18822,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_ } break; case GGML_OP_SCALE: case GGML_OP_SET: - case GGML_OP_CONT: case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: @@ -19137,8 +18981,11 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput sched_yield(); } - * node_n = atomic_load(&state->shared->node_n); - if (* node_n != last_node_n) break; + *node_n = atomic_load(&state->shared->node_n); + if (*node_n != last_node_n) { + break; + } + #if defined(__SSE3__) // Tell the processor we're spinning. It's a processor hint for spinlocks. _mm_pause(); @@ -19148,15 +18995,18 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) { // wait for other threads to finish - const int last_task_phase = * task_phase; + const int last_task_phase = *task_phase; while (true) { if (do_yield) { sched_yield(); } - * task_phase = atomic_load(&state->shared->node_task); - if (* task_phase != last_task_phase) break; + *task_phase = atomic_load(&state->shared->node_task); + if (*task_phase != last_task_phase) { + break; + } + #if defined(__SSE3__) // Tell the processor we're spinning. It's a processor hint for spinlocks. _mm_pause(); @@ -19356,17 +19206,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa { const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node)) { - if (node->src[0]->type != GGML_TYPE_F32) { - // here we need memory for fully dequantized matrix from src0 - // take into account that src0 can be broadcasted into src1[2,3] - cur = ggml_type_size(GGML_TYPE_F32) - * node->src[0]->ne[0]*node->src[0]->ne[1] - * node->src[1]->ne[2]*node->src[1]->ne[3]; - } - } else -#endif if (node->src[1]->type != vec_dot_type) { cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); } @@ -22664,7 +22503,7 @@ int ggml_cpu_has_wasm_simd(void) { } int ggml_cpu_has_blas(void) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL) +#if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL) return 1; #else return 0; diff --git a/llama.cpp b/llama.cpp index 8b675ea99..225ea977f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -21,6 +21,10 @@ # include "ggml-kompute.h" #endif +#ifdef GGML_USE_BLAS +# include "ggml-blas.h" +#endif + #ifdef GGML_USE_METAL # include "ggml-metal.h" #endif @@ -2299,9 +2303,13 @@ struct llama_context { std::vector backends; #ifdef GGML_USE_METAL ggml_backend_t backend_metal = nullptr; +#endif +#ifdef GGML_USE_BLAS + ggml_backend_t backend_blas = nullptr; #endif ggml_backend_t backend_cpu = nullptr; + const llama_model & model; // key + value cache for the self attention @@ -11529,7 +11537,8 @@ static struct ggml_cgraph * llama_build_graph( if (batch.n_tokens < 32 || full_offload) { if (il != -1 && strcmp(name, "norm") == 0) { for (auto * backend : lctx.backends) { - if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) { + if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) && + (ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) { ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend); break; } @@ -12026,6 +12035,11 @@ static void llama_graph_compute( ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data); } +#ifdef GGML_USE_BLAS + if (lctx.backend_blas != nullptr) { + ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads); + } +#endif ggml_backend_sched_graph_compute_async(lctx.sched, gf); @@ -12248,17 +12262,6 @@ static int llama_decode_internal( } // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); - // for big prompts, if BLAS is enabled, it is better to use only one thread - // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance - // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well - // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering - // with the BLAS calls. need a better solution - // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is - // being processed then Accelerate/BLAS will not be involved, so capping would limit performance. - if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { - n_threads = std::min(4, n_threads); - } - ggml_backend_sched_alloc_graph(lctx.sched, gf); llama_set_inputs(lctx, u_batch); @@ -16251,6 +16254,16 @@ struct llama_context * llama_new_context_with_model( ctx->backends.push_back(backend); } #endif + +#ifdef GGML_USE_BLAS + ctx->backend_blas = ggml_backend_blas_init(); + if (ctx->backend_blas == nullptr) { + LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__); + } else { + ctx->backends.push_back(ctx->backend_blas); + } +#endif + #if defined(GGML_USE_RPC) if (model->n_gpu_layers > 0) { for (const auto & endpoint : model->rpc_servers) {