Merge commit from fork

This commit is contained in:
Georgi Gerganov 2024-08-09 23:03:21 +03:00 committed by GitHub
parent 6afd1a99dc
commit b72942fac9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 53 additions and 3 deletions

View File

@@ -1,5 +1,9 @@
## Overview
> [!IMPORTANT]
> This example and the RPC backend are currently in a proof-of-concept development stage. As such, the functionality is fragile and
> insecure. **Never run the RPC server on an open network or in a sensitive environment!**
The `rpc-server` allows running `ggml` backend on a remote host.
The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
This can be used for distributed LLM inference with `llama.cpp` in the following way:

View File

@@ -16,7 +16,7 @@
#include <stdio.h>
struct rpc_server_params {
std::string host = "0.0.0.0"; std::string host = "127.0.0.1";
int port = 50052;
size_t backend_mem = 0;
};
@@ -114,6 +114,17 @@ int main(int argc, char * argv[]) {
fprintf(stderr, "Invalid parameters\n");
return 1;
}
if (params.host != "127.0.0.1") {
fprintf(stderr, "\n");
fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
fprintf(stderr, "WARNING: Host ('%s') is != '127.0.0.1'\n", params.host.c_str());
fprintf(stderr, " Never expose the RPC server to an open network!\n");
fprintf(stderr, " This is an experimental feature and is not secure!\n");
fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
fprintf(stderr, "\n");
}
ggml_backend_t backend = create_backend();
if (!backend) {
fprintf(stderr, "Failed to create backend\n");

View File

@@ -197,6 +197,10 @@ static std::shared_ptr<socket_t> create_server_socket(const char * host, int port
fprintf(stderr, "Failed to set SO_REUSEADDR\n");
return nullptr;
}
if (inet_addr(host) == INADDR_NONE) {
fprintf(stderr, "Invalid host address: %s\n", host);
return nullptr;
}
struct sockaddr_in serv_addr;
serv_addr.sin_family = AF_INET;
serv_addr.sin_addr.s_addr = inet_addr(host);
@@ -879,6 +883,14 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) {
if (result->buffer && buffers.find(result->buffer) == buffers.end()) {
return nullptr;
}
// require that the tensor data does not go beyond the buffer end
uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow
GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size);
result->op = (ggml_op) tensor->op;
for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
    result->op_params[i] = tensor->op_params[i];
@@ -898,7 +910,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
uint64_t offset;
memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset); const size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset);
struct ggml_init_params params {
    /*.mem_size =*/ ggml_tensor_overhead(),
@@ -913,6 +925,17 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
return false;
}
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
// sanitize tensor->data
{
const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
}
}
const void * data = input.data() + sizeof(rpc_tensor) + sizeof(offset);
ggml_backend_tensor_set(tensor, data, offset, size);
ggml_free(ctx);
@@ -943,6 +966,17 @@ bool rpc_server::get_tensor(const std::vector<uint8_t> & input, std::vector<uint8_t> & output) {
return false;
}
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
// sanitize tensor->data
{
const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
}
}
// output serialization format: | data (size bytes) |
output.resize(size, 0);
ggml_backend_tensor_get(tensor, output.data(), offset, size);

View File

@@ -3724,7 +3724,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
        struct ggml_tensor * view_src,
        size_t view_offs) {
assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS); GGML_ASSERT(type >= 0 && type < GGML_TYPE_COUNT);
GGML_ASSERT(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
// find the base tensor and absolute offset
if (view_src != NULL && view_src->view_src != NULL) {