llama : offload to RPC in addition to other backends (#7640)

* llama : offload to RPC in addition to other backends

* - fix copy_tensor being called on the src buffer instead of the dst buffer

- always initialize views in the view_src buffer

- add RPC backend to Makefile build

- add endpoint to all RPC object names

* add rpc-server to Makefile

* Update llama.cpp

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
This commit is contained in:
Radoslav Gerganov 2024-06-03 20:03:26 +03:00 committed by GitHub
parent a5735e4426
commit bde7cd3cd9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 86 additions and 53 deletions

View File

@@ -69,6 +69,10 @@ ifeq ($(UNAME_S),Darwin)
endif endif
endif endif
ifdef LLAMA_RPC
BUILD_TARGETS += rpc-server
endif
default: $(BUILD_TARGETS) default: $(BUILD_TARGETS)
test: $(TEST_TARGETS) test: $(TEST_TARGETS)
@@ -429,6 +433,11 @@ ifdef LLAMA_BLIS
MK_LDFLAGS += -lblis -L/usr/local/lib MK_LDFLAGS += -lblis -L/usr/local/lib
endif # LLAMA_BLIS endif # LLAMA_BLIS
ifdef LLAMA_RPC
MK_CPPFLAGS += -DGGML_USE_RPC
OBJS += ggml-rpc.o
endif # LLAMA_RPC
ifdef LLAMA_CUBLAS ifdef LLAMA_CUBLAS
# LLAMA_CUBLAS is deprecated and will be removed in the future # LLAMA_CUBLAS is deprecated and will be removed in the future
LLAMA_CUDA := 1 LLAMA_CUDA := 1
@@ -654,11 +663,26 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
endif endif
endif # LLAMA_METAL endif # LLAMA_METAL
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
ifndef LLAMA_NO_LLAMAFILE ifndef LLAMA_NO_LLAMAFILE
sgemm.o: sgemm.cpp sgemm.h ggml.h sgemm.o: sgemm.cpp sgemm.h ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
endif endif
ifdef LLAMA_RPC
ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h
$(CXX) $(CXXFLAGS) -c $< -o $@
rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h
$(CXX) $(CXXFLAGS) -c $< -o $@
rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
endif # LLAMA_RPC
GF_CC := $(CC) GF_CC := $(CC)
include scripts/get-flags.mk include scripts/get-flags.mk
@@ -738,14 +762,9 @@ unicode.o: unicode.cpp unicode.h
unicode-data.o: unicode-data.cpp unicode-data.h unicode-data.o: unicode-data.cpp unicode-data.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
common.o: common/common.cpp $(COMMON_H_DEPS) common.o: common/common.cpp $(COMMON_H_DEPS)
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@

View File

@@ -750,7 +750,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
// this tensor was allocated without ggml-backend // this tensor was allocated without ggml-backend
return; return;
} }
ggml_backend_view_init(galloc->buffers[buffer_id], tensor); ggml_backend_view_init(tensor);
} }
} else { } else {
if (tensor->data == NULL) { if (tensor->data == NULL) {
@@ -899,12 +899,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
if (t->view_src == NULL) { if (t->view_src == NULL) {
ggml_tallocr_alloc(&tallocr, t); ggml_tallocr_alloc(&tallocr, t);
} else if (t->buffer == NULL) { } else if (t->buffer == NULL) {
ggml_backend_view_init(buffer, t); ggml_backend_view_init(t);
} }
} else { } else {
if (t->view_src != NULL && t->buffer == NULL) { if (t->view_src != NULL && t->buffer == NULL) {
// view of a pre-allocated tensor // view of a pre-allocated tensor
ggml_backend_view_init(buffer, t); ggml_backend_view_init(t);
} }
} }
} }

View File

@@ -151,7 +151,7 @@ void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) { bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer; ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
if (dst_buf->iface.cpy_tensor) { if (dst_buf->iface.cpy_tensor) {
return src->buffer->iface.cpy_tensor(dst_buf, src, dst); return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
} }
return false; return false;
} }
@@ -1887,15 +1887,15 @@ ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched,
// utils // utils
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { void ggml_backend_view_init(struct ggml_tensor * tensor) {
GGML_ASSERT(tensor->buffer == NULL); GGML_ASSERT(tensor->buffer == NULL);
GGML_ASSERT(tensor->view_src != NULL); GGML_ASSERT(tensor->view_src != NULL);
GGML_ASSERT(tensor->view_src->buffer != NULL); GGML_ASSERT(tensor->view_src->buffer != NULL);
GGML_ASSERT(tensor->view_src->data != NULL); GGML_ASSERT(tensor->view_src->data != NULL);
tensor->buffer = buffer; tensor->buffer = tensor->view_src->buffer;
tensor->data = (char *)tensor->view_src->data + tensor->view_offs; tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
ggml_backend_buffer_init_tensor(buffer, tensor); ggml_backend_buffer_init_tensor(tensor->buffer, tensor);
} }
void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) { void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
@@ -1954,7 +1954,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
struct ggml_tensor * dst = node_copies[id]; struct ggml_tensor * dst = node_copies[id];
if (dst->view_src != NULL) { if (dst->view_src != NULL) {
graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src); graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
ggml_backend_view_init(dst->view_src->buffer, dst); ggml_backend_view_init(dst);
} }
else { else {
ggml_backend_tensor_copy(src, dst); ggml_backend_tensor_copy(src, dst);

View File

@@ -225,7 +225,7 @@ extern "C" {
// Tensor initialization // Tensor initialization
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr); GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
#ifdef __cplusplus #ifdef __cplusplus

View File

@@ -491,7 +491,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer
if (remote_ptr != 0) { if (remote_ptr != 0) {
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft, ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
ggml_backend_rpc_buffer_interface, ggml_backend_rpc_buffer_interface,
new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC"}, new ggml_backend_rpc_buffer_context{sock, {}, remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"},
remote_size); remote_size);
return buffer; return buffer;
} else { } else {
@@ -692,7 +692,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const
GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) { GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context { ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
/* .endpoint = */ endpoint, /* .endpoint = */ endpoint,
/* .name = */ "RPC", /* .name = */ "RPC[" + std::string(endpoint) + "]",
}; };
ggml_backend_t backend = new ggml_backend { ggml_backend_t backend = new ggml_backend {

View File

@@ -2371,13 +2371,34 @@ struct llama_context {
struct llama_control_vector cvec; struct llama_control_vector cvec;
}; };
static size_t llama_get_device_count(const llama_model & model) {
size_t count = 1;
#if defined(GGML_USE_CUDA)
count = ggml_backend_cuda_get_device_count();
#elif defined(GGML_USE_SYCL)
count = ggml_backend_sycl_get_device_count();
#elif defined(GGML_USE_VULKAN)
count = ggml_backend_vk_get_device_count();
#endif
#if defined(GGML_USE_RPC)
count += model.rpc_servers.size();
#endif
return count;
GGML_UNUSED(model);
}
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) { static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
ggml_backend_buffer_type_t buft = nullptr; ggml_backend_buffer_type_t buft = nullptr;
#ifdef GGML_USE_RPC #if defined(GGML_USE_RPC)
std::string endpoint = model.rpc_servers[gpu]; int dev_count = (int)llama_get_device_count(model);
buft = ggml_backend_rpc_buffer_type(endpoint.c_str()); int rpc_count = (int)model.rpc_servers.size();
#elif defined(GGML_USE_METAL) if (gpu >= dev_count - rpc_count) {
const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
return ggml_backend_rpc_buffer_type(endpoint);
}
#endif
#if defined(GGML_USE_METAL)
buft = ggml_backend_metal_buffer_type(); buft = ggml_backend_metal_buffer_type();
#elif defined(GGML_USE_CUDA) #elif defined(GGML_USE_CUDA)
buft = ggml_backend_cuda_buffer_type(gpu); buft = ggml_backend_cuda_buffer_type(gpu);
@@ -2425,29 +2446,19 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
GGML_UNUSED(tensor_split); GGML_UNUSED(tensor_split);
} }
static size_t llama_get_device_count(const llama_model & model) {
#if defined(GGML_USE_RPC)
return model.rpc_servers.size();
#elif defined(GGML_USE_CUDA)
return ggml_backend_cuda_get_device_count();
#elif defined(GGML_USE_SYCL)
return ggml_backend_sycl_get_device_count();
#elif defined(GGML_USE_VULKAN)
return ggml_backend_vk_get_device_count();
#else
return 1;
#endif
GGML_UNUSED(model);
}
static size_t llama_get_device_memory(const llama_model & model, int device) { static size_t llama_get_device_memory(const llama_model & model, int device) {
#if defined(GGML_USE_RPC) #if defined(GGML_USE_RPC)
size_t total; int dev_count = (int)llama_get_device_count(model);
size_t free; int rpc_count = (int)model.rpc_servers.size();
std::string endpoint = model.rpc_servers[device]; if (device >= dev_count - rpc_count) {
ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total); size_t total;
return free; size_t free;
#elif defined(GGML_USE_CUDA) const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
return free;
}
#endif
#if defined(GGML_USE_CUDA)
size_t total; size_t total;
size_t free; size_t free;
ggml_backend_cuda_get_device_memory(device, &free, &total); ggml_backend_cuda_get_device_memory(device, &free, &total);
@@ -16160,7 +16171,7 @@ struct llama_model * llama_load_model_from_file(
return true; return true;
}; };
} }
if (params.rpc_servers != nullptr) { if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
// split the servers set them into model->rpc_servers // split the servers set them into model->rpc_servers
std::string servers(params.rpc_servers); std::string servers(params.rpc_servers);
size_t pos = 0; size_t pos = 0;
@@ -16323,17 +16334,7 @@ struct llama_context * llama_new_context_with_model(
if (!hparams.vocab_only) { if (!hparams.vocab_only) {
// initialize backends // initialize backends
#if defined(GGML_USE_RPC) #if defined(GGML_USE_METAL)
for (auto & server : model->rpc_servers) {
ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
}
#elif defined(GGML_USE_METAL)
if (model->n_gpu_layers > 0) { if (model->n_gpu_layers > 0) {
ctx->backend_metal = ggml_backend_metal_init(); ctx->backend_metal = ggml_backend_metal_init();
if (ctx->backend_metal == nullptr) { if (ctx->backend_metal == nullptr) {
@@ -16425,6 +16426,19 @@ struct llama_context * llama_new_context_with_model(
} }
ctx->backends.push_back(backend); ctx->backends.push_back(backend);
} }
#endif
#if defined(GGML_USE_RPC)
if (model->n_gpu_layers > 0) {
for (const auto & endpoint : model->rpc_servers) {
ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
}
}
#endif #endif
ctx->backend_cpu = ggml_backend_cpu_init(); ctx->backend_cpu = ggml_backend_cpu_init();
if (ctx->backend_cpu == nullptr) { if (ctx->backend_cpu == nullptr) {