mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-12-25 02:44:36 +00:00)
rpc : add backend registry / device interfaces (#9812)
* rpc : add backend registry / device interfaces
* llama : add llama_supports_rpc API
* ggml_backend_rpc_start_rpc_server -> ggml_backend_rpc_start_server
This commit is contained in:
  parent cf8e0a3bb9
  commit 0e9f760eb1
@@ -1353,15 +1353,15 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.image.emplace_back(value);
         }
     ).set_examples({LLAMA_EXAMPLE_LLAVA}));
-#ifdef GGML_USE_RPC
+    if (llama_supports_rpc()) {
     add_opt(llama_arg(
         {"--rpc"}, "SERVERS",
         "comma separated list of RPC servers",
         [](gpt_params & params, const std::string & value) {
             params.rpc_servers = value;
         }
     ).set_env("LLAMA_ARG_RPC"));
-#endif
+    }
     add_opt(llama_arg(
         {"--mlock"},
         "force system to keep model in RAM rather than swapping or compressing",
@@ -304,9 +304,9 @@ static void print_usage(int /* argc */, char ** argv) {
     printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
     printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
     printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
-#ifdef GGML_USE_RPC
+    if (llama_supports_rpc()) {
     printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
-#endif
+    }
     printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
     printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
     printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
@@ -497,14 +497,12 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = string_split<int>(argv[i], split_delim);
             params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
-#ifdef GGML_USE_RPC
-        } else if (arg == "-rpc" || arg == "--rpc") {
+        } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.rpc_servers.push_back(argv[i]);
-#endif
         } else if (arg == "-sm" || arg == "--split-mode") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -151,7 +151,7 @@ int main(int argc, char * argv[]) {
         get_backend_memory(&free_mem, &total_mem);
     }
     printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
-    start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
+    ggml_backend_rpc_start_server(backend, endpoint.c_str(), free_mem, total_mem);
     ggml_backend_free(backend);
     return 0;
 }
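For orientation, a minimal, hedged sketch of a standalone server built on the renamed entry point; the endpoint and advertised memory sizes are placeholders, and a plain CPU backend is assumed instead of the backend selection done by the real rpc-server example:

// Minimal RPC server sketch (assumptions: CPU backend, fixed endpoint, placeholder memory sizes).
#include "ggml-backend.h"
#include "ggml-rpc.h"
#include <cstdio>

int main() {
    ggml_backend_t backend = ggml_backend_cpu_init();
    if (backend == nullptr) {
        fprintf(stderr, "failed to initialize CPU backend\n");
        return 1;
    }
    const char * endpoint  = "0.0.0.0:50052";             // placeholder address
    size_t       free_mem  = 8ull  * 1024 * 1024 * 1024;  // placeholder: advertised free memory
    size_t       total_mem = 16ull * 1024 * 1024 * 1024;  // placeholder: advertised total memory
    printf("Starting RPC server on %s\n", endpoint);
    ggml_backend_rpc_start_server(backend, endpoint, free_mem, total_mem); // blocks and serves clients
    ggml_backend_free(backend);
    return 0;
}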
@@ -17,7 +17,11 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * en

 GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);

-GGML_API void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
+GGML_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
+
+GGML_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
+
+GGML_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);

 #ifdef __cplusplus
 }
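A short usage sketch of the new public entry points, assuming ggml-backend.h and ggml-rpc.h are included and a server is already listening at the placeholder endpoint; ggml_backend_dev_init, ggml_backend_dev_buffer_type and ggml_backend_dev_memory are the generic device helpers from ggml-backend.h:

// Sketch: attach to a remote RPC server through the new device API (placeholder endpoint).
ggml_backend_dev_t dev = ggml_backend_rpc_add_device("192.168.1.10:50052");

// The device then behaves like any other ggml device:
ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);        // equivalent to ggml_backend_rpc_init(endpoint)
ggml_backend_buffer_type_t buft = ggml_backend_dev_buffer_type(dev); // remote buffer type for allocations

size_t free_mem, total_mem;
ggml_backend_dev_memory(dev, &free_mem, &total_mem);                 // forwards to ggml_backend_rpc_get_device_memory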
@@ -542,6 +542,10 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
 #include "ggml-blas.h"
 #endif

+#ifdef GGML_USE_RPC
+#include "ggml-rpc.h"
+#endif
+
 struct ggml_backend_registry {
     std::vector<ggml_backend_reg_t> backends;
     std::vector<ggml_backend_dev_t> devices;
@@ -556,6 +560,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_BLAS
         register_backend(ggml_backend_blas_reg());
 #endif
+#ifdef GGML_USE_RPC
+        register_backend(ggml_backend_rpc_reg());
+#endif

         // TODO: sycl, vulkan, kompute, cann

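Once registered, the backend can be discovered by name through the generic registry API; a small sketch, assuming the registry helpers from ggml-backend.h (the device count is expected to be zero because RPC devices are added per endpoint rather than enumerated):

// Sketch: discover the RPC backend registration at runtime.
ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
if (rpc_reg != nullptr) {
    // The RPC backend does not enumerate devices; endpoints are added explicitly
    // via the "ggml_backend_rpc_add_device" proc address (see the llama.cpp hunks below).
    printf("RPC backend available, %zu enumerated devices\n", ggml_backend_reg_dev_count(rpc_reg));
}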
@@ -25,7 +25,7 @@
 #  include <netdb.h>
 #  include <unistd.h>
 #endif
-#include <string.h>
+#include <cstring>

 #define UNUSED GGML_UNUSED

@@ -630,22 +630,6 @@ static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, g
     return (enum ggml_status)output[0];
 }

-static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
-    UNUSED(backend);
-    UNUSED(op);
-    //TODO: call the remote backend and cache the results
-    return true;
-}
-
-static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
-    if (!buft || buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
-        return false;
-    }
-    ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
-    ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
-    return buft_ctx->endpoint == rpc_ctx->endpoint;
-}
-
 static ggml_backend_i ggml_backend_rpc_interface = {
     /* .get_name = */ ggml_backend_rpc_name,
     /* .free = */ ggml_backend_rpc_free,
@@ -659,8 +643,8 @@ static ggml_backend_i ggml_backend_rpc_interface = {
     /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_rpc_graph_compute,
-    /* .supports_op = */ ggml_backend_rpc_supports_op,
-    /* .supports_buft = */ ggml_backend_rpc_supports_buft,
+    /* .supports_op = */ NULL,
+    /* .supports_buft = */ NULL,
     /* .offload_op = */ NULL,
     /* .event_record = */ NULL,
     /* .event_wait = */ NULL,
@@ -691,7 +675,7 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * en

     ggml_backend_buffer_type_t buft = new ggml_backend_buffer_type {
         /* .iface = */ ggml_backend_rpc_buffer_type_interface,
-        /* .device = */ nullptr,
+        /* .device = */ ggml_backend_rpc_add_device(endpoint),
         /* .context = */ buft_ctx
     };
     buft_map[endpoint] = buft;
@@ -707,7 +691,7 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
     ggml_backend_t backend = new ggml_backend {
         /* .guid = */ ggml_backend_rpc_guid(),
         /* .interface = */ ggml_backend_rpc_interface,
-        /* .device = */ nullptr,
+        /* .device = */ ggml_backend_rpc_add_device(endpoint),
         /* .context = */ ctx
     };
     return backend;
@@ -1189,7 +1173,7 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
     }
 }

-void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem) {
+void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem) {
     std::string host;
     int port;
     if (!parse_endpoint(endpoint, host, port)) {
@@ -1226,3 +1210,179 @@ void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free
     WSACleanup();
 #endif
 }
+
+// device interface
+
+struct ggml_backend_rpc_device_context {
+    std::string endpoint;
+    std::string name;
+};
+
+static const char * ggml_backend_rpc_device_get_name(ggml_backend_dev_t dev) {
+    ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
+
+    return ctx->name.c_str();
+}
+
+static const char * ggml_backend_rpc_device_get_description(ggml_backend_dev_t dev) {
+    ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
+
+    return ctx->name.c_str();
+}
+
+static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
+
+    ggml_backend_rpc_get_device_memory(ctx->endpoint.c_str(), free, total);
+
+    UNUSED(dev);
+}
+
+static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
+    // TODO: obtain value from the server
+    return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
+
+    UNUSED(dev);
+}
+
+static void ggml_backend_rpc_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    props->name = ggml_backend_rpc_device_get_name(dev);
+    props->description = ggml_backend_rpc_device_get_description(dev);
+    props->type = ggml_backend_rpc_device_get_type(dev);
+    ggml_backend_rpc_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps = {
+        /* .async = */ false,
+        /* .host_buffer = */ false,
+        /* .buffer_from_host_ptr = */ false,
+        /* .events = */ false,
+    };
+}
+
+static ggml_backend_t ggml_backend_rpc_device_init(ggml_backend_dev_t dev, const char * params) {
+    ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
+
+    return ggml_backend_rpc_init(ctx->endpoint.c_str());
+
+    UNUSED(params);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_backend_dev_t dev) {
+    ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
+
+    return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
+
+    UNUSED(dev);
+}
+
+static ggml_backend_buffer_t ggml_backend_rpc_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    return ggml_backend_cpu_buffer_from_ptr(ptr, size);
+
+    UNUSED(dev);
+    UNUSED(max_tensor_size);
+}
+
+static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    UNUSED(dev);
+    UNUSED(op);
+    //TODO: call the remote backend and cache the results
+    return true;
+}
+
+static bool ggml_backend_rpc_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    if (!buft || buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
+        return false;
+    }
+    ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
+    ggml_backend_rpc_device_context * dev_ctx = (ggml_backend_rpc_device_context *)dev->context;
+    return buft_ctx->endpoint == dev_ctx->endpoint;
+}
+
+static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {
+    /* .get_name = */ ggml_backend_rpc_device_get_name,
+    /* .get_description = */ ggml_backend_rpc_device_get_description,
+    /* .get_memory = */ ggml_backend_rpc_device_get_memory,
+    /* .get_type = */ ggml_backend_rpc_device_get_type,
+    /* .get_props = */ ggml_backend_rpc_device_get_props,
+    /* .init_backend = */ ggml_backend_rpc_device_init,
+    /* .get_buffer_type = */ ggml_backend_rpc_device_get_buffer_type,
+    /* .get_host_buffer_type = */ NULL,
+    /* .buffer_from_host_ptr = */ ggml_backend_rpc_device_buffer_from_ptr,
+    /* .supports_op = */ ggml_backend_rpc_device_supports_op,
+    /* .supports_buft = */ ggml_backend_rpc_device_supports_buft,
+    /* .offload_op = */ NULL,
+    /* .event_new = */ NULL,
+    /* .event_free = */ NULL,
+    /* .event_synchronize = */ NULL,
+};
+
+// backend reg interface
+
+static const char * ggml_backend_rpc_reg_get_name(ggml_backend_reg_t reg) {
+    return "RPC";
+
+    UNUSED(reg);
+}
+
+static size_t ggml_backend_rpc_reg_get_device_count(ggml_backend_reg_t reg) {
+    return 0;
+
+    UNUSED(reg);
+}
+
+static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    GGML_ABORT("The RPC backend does not have enumerated devices - use ggml_backend_add_device instead");
+
+    UNUSED(reg);
+    UNUSED(index);
+}
+
+static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    if (std::strcmp(name, "ggml_backend_rpc_add_device") == 0) {
+        return (void *)ggml_backend_rpc_add_device;
+    }
+    return NULL;
+
+    UNUSED(reg);
+}
+
+static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = {
+    /* .get_name = */ ggml_backend_rpc_reg_get_name,
+    /* .get_device_count = */ ggml_backend_rpc_reg_get_device_count,
+    /* .get_device = */ ggml_backend_rpc_reg_get_device,
+    /* .get_proc_address = */ ggml_backend_rpc_get_proc_address,
+};
+
+ggml_backend_reg_t ggml_backend_rpc_reg(void) {
+    static struct ggml_backend_reg ggml_backend_rpc_reg = {
+        /* .iface = */ ggml_backend_rpc_reg_i,
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_rpc_reg;
+}
+
+ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) {
+    static std::unordered_map<std::string, ggml_backend_dev_t> dev_map;
+
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
+
+    if (dev_map.find(endpoint) != dev_map.end()) {
+        return dev_map[endpoint];
+    }
+
+    ggml_backend_rpc_device_context * ctx = new ggml_backend_rpc_device_context {
+        /* .endpoint = */ endpoint,
+        /* .name = */ "RPC[" + std::string(endpoint) + "]",
+    };
+
+    ggml_backend_dev_t dev = new ggml_backend_device {
+        /* .iface = */ ggml_backend_rpc_device_i,
+        /* .reg = */ ggml_backend_rpc_reg(),
+        /* .context = */ ctx,
+    };
+
+    dev_map[endpoint] = dev;
+
+    return dev;
+}
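To illustrate how the new device slots into the generic device API, a hedged sketch that queries its properties; the endpoint is a placeholder, a reachable server is assumed, and ggml_backend_dev_get_props / ggml_backend_dev_init are the generic helpers from ggml-backend.h:

// Sketch: inspect an RPC device through the generic ggml device API.
ggml_backend_dev_t dev = ggml_backend_rpc_add_device("127.0.0.1:50052"); // placeholder endpoint

struct ggml_backend_dev_props props;
ggml_backend_dev_get_props(dev, &props); // fills name, description, type, free/total memory, caps
printf("%s (%s): %zu / %zu bytes free\n",
       props.name, props.description, props.memory_free, props.memory_total);

ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); // dispatches to ggml_backend_rpc_init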
@@ -433,6 +433,7 @@ extern "C" {
     LLAMA_API bool llama_supports_mmap       (void);
     LLAMA_API bool llama_supports_mlock      (void);
     LLAMA_API bool llama_supports_gpu_offload(void);
+    LLAMA_API bool llama_supports_rpc        (void);

     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
@@ -8,10 +8,6 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"

-#ifdef GGML_USE_RPC
-#  include "ggml-rpc.h"
-#endif
-
 #if defined(GGML_USE_VULKAN)
 #  include "ggml-vulkan.h"
 #elif defined(GGML_USE_SYCL)
@@ -3404,10 +3400,6 @@ struct llama_lora_adapter {
 static int llama_get_device_count(const llama_model & model) {
     int count = (int) model.devices.size();

-#if defined(GGML_USE_RPC)
-    count += (int) model.rpc_servers.size();
-#endif
-
 #if defined(GGML_USE_SYCL)
     count += ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
@@ -3460,15 +3452,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_mode
 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int device) {
     ggml_backend_buffer_type_t buft = nullptr;

-#if defined(GGML_USE_RPC)
-    int rpc_count = (int)model.rpc_servers.size();
-    if (device < rpc_count) {
-        const char * endpoint = model.rpc_servers[device].c_str();
-        return ggml_backend_rpc_buffer_type(endpoint);
-    }
-    device -= rpc_count;
-#endif
-
     if (device < (int)model.devices.size()) {
         return ggml_backend_dev_buffer_type(model.devices[device]);
     }
@@ -3523,18 +3506,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
 }

 static size_t llama_get_device_memory(const llama_model & model, int device) {
-#if defined(GGML_USE_RPC)
-    int rpc_count = (int)model.rpc_servers.size();
-    if (device < rpc_count) {
-        size_t total;
-        size_t free;
-        const char * endpoint = model.rpc_servers[device].c_str();
-        ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
-        return free;
-    }
-    device = device - rpc_count;
-#endif
-
     if (device < (int)model.devices.size()) {
         ggml_backend_dev_t dev = model.devices[device];
         size_t total;
@@ -19019,15 +18990,20 @@ bool llama_supports_mlock(void) {

 bool llama_supports_gpu_offload(void) {
 #if defined(GGML_USE_VULKAN) || \
-    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
+    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
 #else
     return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
-           ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr;
+           ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr ||
+           llama_supports_rpc();
 #endif
 }

+bool llama_supports_rpc(void) {
+    return ggml_backend_reg_by_name("RPC") != nullptr;
+}
+
 void llama_backend_init(void) {
     ggml_time_init();

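In application code the new capability check replaces compile-time #ifdef GGML_USE_RPC guards; a small sketch, where params stands in for an application's own option struct (as in the gpt_params change earlier in this diff) and the endpoints are placeholders:

// Sketch: gate RPC options on the runtime check instead of a compile-time #ifdef.
if (llama_supports_rpc()) {
    params.rpc_servers = "192.168.1.10:50052,192.168.1.11:50052"; // placeholder endpoints
} else {
    fprintf(stderr, "this build of llama.cpp has no RPC backend\n");
}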
@@ -19102,6 +19078,36 @@ struct llama_model * llama_load_model_from_file(
         model->rpc_servers.push_back(servers);
     }

+    // add RPC devices
+    if (!model->rpc_servers.empty()) {
+        ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+        if (!rpc_reg) {
+            LLAMA_LOG_ERROR("%s: failed to find RPC backend\n", __func__);
+            llama_free_model(model);
+            return nullptr;
+        }
+
+        // ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
+        using ggml_backend_rpc_add_device_t = ggml_backend_dev_t (*)(const char *);
+        ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+        if (!ggml_backend_rpc_add_device_fn) {
+            LLAMA_LOG_ERROR("%s: failed to find RPC device add function\n", __func__);
+            llama_free_model(model);
+            return nullptr;
+        }
+
+        for (const std::string & server : model->rpc_servers) {
+            ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
+            if (dev) {
+                model->devices.push_back(dev);
+            } else {
+                LLAMA_LOG_ERROR("%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
+                llama_free_model(model);
+                return nullptr;
+            }
+        }
+    }
+
     // create list of devices to use with this model
     // currently, we use all available devices
     // TODO: rework API to give user more control over device selection
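The proc-address lookup above keeps llama.cpp from linking against the RPC backend directly; the same pattern can be reproduced in user code, sketched below with a placeholder endpoint and assuming the registry helpers from ggml-backend.h are available:

// Sketch: resolve ggml_backend_rpc_add_device through the registry instead of linking it.
typedef ggml_backend_dev_t (*rpc_add_device_fn_t)(const char * endpoint);

ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
if (rpc_reg != nullptr) {
    rpc_add_device_fn_t add_device =
        (rpc_add_device_fn_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
    if (add_device != nullptr) {
        ggml_backend_dev_t dev = add_device("10.0.0.2:50052"); // placeholder endpoint
        // dev can now be passed wherever a ggml_backend_dev_t is expected
    }
}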
@@ -19128,7 +19134,7 @@ struct llama_model * llama_load_model_from_file(
     } else if (status == -2) {
         LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
     }
-    delete model;
+    llama_free_model(model);
     return nullptr;
 }

@@ -19311,23 +19317,6 @@ struct llama_context * llama_new_context_with_model(
         main_gpu -= (int)model->devices.size();
     }

-#if defined(GGML_USE_RPC)
-    if (model->n_gpu_layers > 0) {
-        for (const auto & endpoint : model->rpc_servers) {
-            ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        }
-    }
-    if (main_gpu >= (int)model->rpc_servers.size()) {
-        main_gpu -= (int)model->rpc_servers.size();
-    }
-#endif
-
 #if defined(GGML_USE_VULKAN)
     if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
         LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);