Mirror of https://github.com/ggerganov/llama.cpp.git
Synced 2024-12-24 10:24:35 +00:00
metal : reduce command encoding overhead (#9698)
* metal : reduce command encoding overhead

ggml-ci

* metal : add comments
This commit is contained in:
parent a90484c6d9
commit cad341d889
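The change is easiest to see in miniature. Metal work is recorded into command buffers and submitted to the GPU; if every graph node pays its own encode/submit round trip, that fixed cost dominates small kernels. Below is a minimal, self-contained C sketch of the batching idea only. It is not the commit's code, and encode_op(), commit_and_wait(), and submissions are made-up stand-ins for Metal's command buffer machinery:

/* Illustrative sketch only -- not the commit's code. "submissions" is a
 * stand-in for Metal command buffer commits: encoding a dispatch is cheap,
 * committing and waiting on a buffer is not. */
#include <stdio.h>

static int submissions = 0;                          /* proxy for commit overhead */

static void encode_op(int node)   { (void) node; }   /* record one dispatch */
static void commit_and_wait(void) { submissions++; } /* submit buffer + sync */

/* naive: one command buffer per node -> one submission per node */
static void compute_naive(int n_nodes) {
    for (int i = 0; i < n_nodes; ++i) {
        encode_op(i);
        commit_and_wait();
    }
}

/* batched: encode the whole graph first, then submit once */
static void compute_batched(int n_nodes) {
    for (int i = 0; i < n_nodes; ++i) {
        encode_op(i);
    }
    commit_and_wait();
}

int main(void) {
    compute_naive(1024);
    printf("naive:   %d submissions\n", submissions); /* 1024 */
    submissions = 0;
    compute_batched(1024);
    printf("batched: %d submissions\n", submissions); /* 1 */
    return 0;
}

The per-commit cost is fixed, so batching the encoding turns O(n_nodes) submissions into O(1) per graph evaluation.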
@@ -204,13 +204,6 @@ static ggml_status compute_piter(
         ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
     }
 
-    // TODO: enable GPU support when support for GGML_OP_SQRT is added
-    //#ifdef GGML_USE_METAL
-    //    if (ggml_backend_is_metal(model.backend)) {
-    //        ggml_backend_metal_set_n_cb(model.backend, params.n_threads);
-    //    }
-    //#endif
-
     ggml_status res = ggml_backend_graph_compute(model.backend, gf);
     if (res == GGML_STATUS_SUCCESS) {
         auto extract_i = [](std::string prefix, std::string str) -> int {
@@ -2444,12 +2444,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
     }
 
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(ctx->backend)) {
-        ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
-    }
-#endif
-
     ggml_backend_graph_compute(ctx->backend, gf);
 
     // the last node is the embedding tensor
@@ -25,9 +25,6 @@
 #include <stddef.h>
 #include <stdbool.h>
 
-// max memory buffers that can be mapped to the device
-#define GGML_METAL_MAX_BUFFERS 64
-
 struct ggml_tensor;
 struct ggml_cgraph;
 
@@ -48,8 +45,6 @@ GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
 
 GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
 
-GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
-
 GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
 
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
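With ggml_backend_metal_set_n_cb() gone from the public header, user code no longer tunes the Metal backend's command buffers at all: build a graph, then call ggml_backend_graph_compute(). The following is a minimal sketch against the public ggml/ggml-alloc/ggml-backend API, not taken from this commit. It uses the CPU backend so it builds without Metal (swap in ggml_backend_metal_init() from this header on macOS); the tensor sizes and values are illustrative:

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

#include <stdio.h>

int main(void) {
    /* context holds only tensor/graph metadata; data lives in a backend buffer */
    struct ggml_init_params ip = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8 + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * c = ggml_add(ctx, a, b);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);

    ggml_backend_t backend = ggml_backend_cpu_init(); /* or ggml_backend_metal_init() */
    ggml_backend_cpu_set_n_threads(backend, 4);       /* CPU-only knob; Metal needs none */
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

    const float av[4] = {1, 2, 3, 4}, bv[4] = {10, 20, 30, 40};
    float cv[4];
    ggml_backend_tensor_set(a, av, 0, sizeof(av));
    ggml_backend_tensor_set(b, bv, 0, sizeof(bv));

    ggml_backend_graph_compute(backend, gf);          /* no set_n_cb() beforehand */

    ggml_backend_tensor_get(c, cv, 0, sizeof(cv));
    printf("%g %g %g %g\n", cv[0], cv[1], cv[2], cv[3]); /* 11 22 33 44 */

    ggml_backend_buffer_free(buf);
    ggml_backend_free(backend);
    ggml_free(ctx);
    return 0;
}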
File diff suppressed because it is too large
@@ -17025,12 +17025,6 @@ static void llama_graph_compute(
         ggml_cgraph * gf,
         int n_threads,
         ggml_threadpool * threadpool) {
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(lctx.backend_metal)) {
-        ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
-    }
-#endif
-
     if (lctx.backend_cpu != nullptr) {
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
         ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
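Taken together, the call-site hunks make the same point as the header change: per-call backend tuning survives only for the CPU backend (ggml_backend_cpu_set_n_threads, ggml_backend_cpu_set_threadpool), while the Metal backend now manages its command buffers internally, which is why every #ifdef GGML_USE_METAL block above could simply be deleted rather than updated.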