Merge commit from fork

2024-12-25 10:54:36 +00:00 · 2024-08-09 23:03:21 +03:00 · 2024-08-09 23:03:21 +03:00 · b72942fac9
commit b72942fac9
parent 6afd1a99dc
4 changed files with 53 additions and 3 deletions
--- a/examples/rpc/README.md
+++ b/examples/rpc/README.md
@ -1,5 +1,9 @@
 ## Overview
 > [!IMPORTANT]
 > This example and the RPC backend are currently in a proof-of-concept development stage. As such, the functionality is fragile and
 > insecure. **Never run the RPC server on an open network or in a sensitive environment!**
 The `rpc-server` allows running a `ggml` backend on a remote host.
 The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
 This can be used for distributed LLM inference with `llama.cpp` in the following way:
--- a/examples/rpc/rpc-server.cpp
+++ b/examples/rpc/rpc-server.cpp
@ -16,7 +16,7 @@
 #include <stdio.h>
 struct rpc_server_params {
-    std::string host        = "0.0.0.0";
+    std::string host        = "127.0.0.1";
    int         port        = 50052;
    size_t      backend_mem = 0;
 };
@ -114,6 +114,17 @@ int main(int argc, char * argv[]) {
        fprintf(stderr, "Invalid parameters\n");
        return 1;
    }
    if (params.host != "127.0.0.1") {
        fprintf(stderr, "\n");
        fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
        fprintf(stderr, "WARNING: Host ('%s') is != '127.0.0.1'\n", params.host.c_str());
        fprintf(stderr, "         Never expose the RPC server to an open network!\n");
        fprintf(stderr, "         This is an experimental feature and is not secure!\n");
        fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
        fprintf(stderr, "\n");
    }
    ggml_backend_t backend = create_backend();
    if (!backend) {
        fprintf(stderr, "Failed to create backend\n");
--- a/ggml/src/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc.cpp
@ -197,6 +197,10 @@ static std::shared_ptr<socket_t> create_server_socket(const char * host, int por
        fprintf(stderr, "Failed to set SO_REUSEADDR\n");
        return nullptr;
    }
    if (inet_addr(host) == INADDR_NONE) {
        fprintf(stderr, "Invalid host address: %s\n", host);
        return nullptr;
    }
    struct sockaddr_in serv_addr;
    serv_addr.sin_family = AF_INET;
    serv_addr.sin_addr.s_addr = inet_addr(host);
@ -879,6 +883,14 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp
    if (result->buffer && buffers.find(result->buffer) == buffers.end()) {
        return nullptr;
    }
    // require that the tensor data does not go beyond the buffer end
    uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
    uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
    uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
    GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow
    GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size);
    result->op = (ggml_op) tensor->op;
    for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
        result->op_params[i] = tensor->op_params[i];
@ -898,7 +910,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
    const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
    uint64_t offset;
    memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
-    size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset);
+    const size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset);
    struct ggml_init_params params {
        /*.mem_size   =*/ ggml_tensor_overhead(),
@ -913,6 +925,17 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
        return false;
    }
    GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
    // sanitize tensor->data
    {
        const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
        const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
        if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
            GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
        }
    }
    const void * data = input.data() + sizeof(rpc_tensor) + sizeof(offset);
    ggml_backend_tensor_set(tensor, data, offset, size);
    ggml_free(ctx);
@ -943,6 +966,17 @@ bool rpc_server::get_tensor(const std::vector<uint8_t> & input, std::vector<uint
        return false;
    }
    GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
    // sanitize tensor->data
    {
        const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
        const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
        if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
            GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
        }
    }
    // output serialization format: | data (size bytes) |
    output.resize(size, 0);
    ggml_backend_tensor_get(tensor, output.data(), offset, size);
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@ -3724,7 +3724,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
        struct ggml_tensor  * view_src,
        size_t                view_offs) {
-    assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
+    GGML_ASSERT(type >= 0 && type < GGML_TYPE_COUNT);
    GGML_ASSERT(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
    // find the base tensor and absolute offset
    if (view_src != NULL && view_src->view_src != NULL) {