mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 03:31:46 +00:00
rpc : make RPC servers come first in the device list (#9296)
* rpc : make RPC servers come first in the device list
* rpc : disable options for non-RPC builds
* rpc : rpc_count always zero for non-RPC builds
This commit is contained in:
parent 9379d3cc17
commit 82e3b03c11
@ -1234,11 +1234,13 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|||||||
#endif // GGML_USE_CUDA_SYCL_VULKAN
|
#endif // GGML_USE_CUDA_SYCL_VULKAN
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
#ifdef GGML_USE_RPC
|
||||||
if (arg == "--rpc") {
|
if (arg == "--rpc") {
|
||||||
CHECK_ARG
|
CHECK_ARG
|
||||||
params.rpc_servers = argv[i];
|
params.rpc_servers = argv[i];
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
if (arg == "--no-mmap") {
|
if (arg == "--no-mmap") {
|
||||||
params.use_mmap = false;
|
params.use_mmap = false;
|
||||||
return true;
|
return true;
|
||||||
@ -1929,7 +1931,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
|||||||
options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });
|
options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });
|
||||||
|
|
||||||
options.push_back({ "backend" });
|
options.push_back({ "backend" });
|
||||||
|
#ifdef GGML_USE_RPC
|
||||||
options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
|
options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
|
||||||
|
#endif
|
||||||
|
|
||||||
if (llama_supports_mlock()) {
|
if (llama_supports_mlock()) {
|
||||||
options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" });
|
options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" });
|
||||||
|
@ -299,7 +299,9 @@ static void print_usage(int /* argc */, char ** argv) {
|
|||||||
printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
|
printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
|
||||||
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
|
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
|
||||||
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
||||||
|
#ifdef GGML_USE_RPC
|
||||||
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
|
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
|
||||||
|
#endif
|
||||||
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
|
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
|
||||||
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
||||||
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
||||||
@ -482,12 +484,14 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
auto p = string_split<int>(argv[i], split_delim);
|
auto p = string_split<int>(argv[i], split_delim);
|
||||||
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
|
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
|
||||||
|
#ifdef GGML_USE_RPC
|
||||||
} else if (arg == "-rpc" || arg == "--rpc") {
|
} else if (arg == "-rpc" || arg == "--rpc") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.rpc_servers.push_back(argv[i]);
|
params.rpc_servers.push_back(argv[i]);
|
||||||
|
#endif
|
||||||
} else if (arg == "-sm" || arg == "--split-mode") {
|
} else if (arg == "-sm" || arg == "--split-mode") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
@ -3346,29 +3346,33 @@ static size_t llama_get_device_count(const llama_model & model) {
|
|||||||
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
|
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
|
||||||
ggml_backend_buffer_type_t buft = nullptr;
|
ggml_backend_buffer_type_t buft = nullptr;
|
||||||
|
|
||||||
#if defined(GGML_USE_RPC)
|
#ifdef GGML_USE_RPC
|
||||||
int dev_count = (int)llama_get_device_count(model);
|
|
||||||
int rpc_count = (int)model.rpc_servers.size();
|
int rpc_count = (int)model.rpc_servers.size();
|
||||||
if (gpu >= dev_count - rpc_count) {
|
#else
|
||||||
const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
|
int rpc_count = 0;
|
||||||
|
#endif
|
||||||
|
int local_gpu = gpu - rpc_count;
|
||||||
|
#if defined(GGML_USE_RPC)
|
||||||
|
if (gpu < rpc_count) {
|
||||||
|
const char * endpoint = model.rpc_servers[gpu].c_str();
|
||||||
return ggml_backend_rpc_buffer_type(endpoint);
|
return ggml_backend_rpc_buffer_type(endpoint);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(GGML_USE_METAL)
|
#if defined(GGML_USE_METAL)
|
||||||
buft = ggml_backend_metal_buffer_type();
|
buft = ggml_backend_metal_buffer_type();
|
||||||
#elif defined(GGML_USE_CUDA)
|
#elif defined(GGML_USE_CUDA)
|
||||||
buft = ggml_backend_cuda_buffer_type(gpu);
|
buft = ggml_backend_cuda_buffer_type(local_gpu);
|
||||||
#elif defined(GGML_USE_VULKAN)
|
#elif defined(GGML_USE_VULKAN)
|
||||||
buft = ggml_backend_vk_buffer_type(gpu);
|
buft = ggml_backend_vk_buffer_type(local_gpu);
|
||||||
#elif defined(GGML_USE_SYCL)
|
#elif defined(GGML_USE_SYCL)
|
||||||
buft = ggml_backend_sycl_buffer_type(gpu);
|
buft = ggml_backend_sycl_buffer_type(local_gpu);
|
||||||
#elif defined(GGML_USE_KOMPUTE)
|
#elif defined(GGML_USE_KOMPUTE)
|
||||||
buft = ggml_backend_kompute_buffer_type(gpu);
|
buft = ggml_backend_kompute_buffer_type(local_gpu);
|
||||||
if (buft == nullptr) {
|
if (buft == nullptr) {
|
||||||
LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
|
LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, local_gpu);
|
||||||
}
|
}
|
||||||
#elif defined(GGML_USE_CANN)
|
#elif defined(GGML_USE_CANN)
|
||||||
buft = ggml_backend_cann_buffer_type(gpu);
|
buft = ggml_backend_cann_buffer_type(local_gpu);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (buft == nullptr) {
|
if (buft == nullptr) {
|
||||||
@ -3376,7 +3380,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
|
|||||||
}
|
}
|
||||||
return buft;
|
return buft;
|
||||||
GGML_UNUSED(model);
|
GGML_UNUSED(model);
|
||||||
GGML_UNUSED(gpu);
|
GGML_UNUSED(local_gpu);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
|
static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
|
||||||
@ -3403,13 +3407,17 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_mo
|
|||||||
}
|
}
|
||||||
|
|
||||||
static size_t llama_get_device_memory(const llama_model & model, int device) {
|
static size_t llama_get_device_memory(const llama_model & model, int device) {
|
||||||
#if defined(GGML_USE_RPC)
|
#ifdef GGML_USE_RPC
|
||||||
int dev_count = (int)llama_get_device_count(model);
|
|
||||||
int rpc_count = (int)model.rpc_servers.size();
|
int rpc_count = (int)model.rpc_servers.size();
|
||||||
if (device >= dev_count - rpc_count) {
|
#else
|
||||||
|
int rpc_count = 0;
|
||||||
|
#endif
|
||||||
|
int local_device = device - rpc_count;
|
||||||
|
#if defined(GGML_USE_RPC)
|
||||||
|
if (device < rpc_count) {
|
||||||
size_t total;
|
size_t total;
|
||||||
size_t free;
|
size_t free;
|
||||||
const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
|
const char * endpoint = model.rpc_servers[device].c_str();
|
||||||
ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
|
ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
|
||||||
return free;
|
return free;
|
||||||
}
|
}
|
||||||
@ -3417,28 +3425,28 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
|
|||||||
#if defined(GGML_USE_CUDA)
|
#if defined(GGML_USE_CUDA)
|
||||||
size_t total;
|
size_t total;
|
||||||
size_t free;
|
size_t free;
|
||||||
ggml_backend_cuda_get_device_memory(device, &free, &total);
|
ggml_backend_cuda_get_device_memory(local_device, &free, &total);
|
||||||
return free;
|
return free;
|
||||||
#elif defined(GGML_USE_SYCL)
|
#elif defined(GGML_USE_SYCL)
|
||||||
size_t total;
|
size_t total;
|
||||||
size_t free;
|
size_t free;
|
||||||
ggml_backend_sycl_get_device_memory(device, &free, &total);
|
ggml_backend_sycl_get_device_memory(local_device, &free, &total);
|
||||||
return free;
|
return free;
|
||||||
#elif defined(GGML_USE_VULKAN)
|
#elif defined(GGML_USE_VULKAN)
|
||||||
size_t total;
|
size_t total;
|
||||||
size_t free;
|
size_t free;
|
||||||
ggml_backend_vk_get_device_memory(device, &free, &total);
|
ggml_backend_vk_get_device_memory(local_device, &free, &total);
|
||||||
return free;
|
return free;
|
||||||
#elif defined(GGML_USE_CANN)
|
#elif defined(GGML_USE_CANN)
|
||||||
size_t total;
|
size_t total;
|
||||||
size_t free;
|
size_t free;
|
||||||
ggml_backend_cann_get_device_memory(device, &free, &total);
|
ggml_backend_cann_get_device_memory(local_device, &free, &total);
|
||||||
return free;
|
return free;
|
||||||
#else
|
#else
|
||||||
return 1;
|
return 1;
|
||||||
#endif
|
#endif
|
||||||
GGML_UNUSED(model);
|
GGML_UNUSED(model);
|
||||||
GGML_UNUSED(device);
|
GGML_UNUSED(local_device);
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
@ -18186,6 +18194,20 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
|
|
||||||
if (!hparams.vocab_only) {
|
if (!hparams.vocab_only) {
|
||||||
// initialize backends
|
// initialize backends
|
||||||
|
#if defined(GGML_USE_RPC)
|
||||||
|
if (model->n_gpu_layers > 0) {
|
||||||
|
for (const auto & endpoint : model->rpc_servers) {
|
||||||
|
ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
|
||||||
|
if (backend == nullptr) {
|
||||||
|
LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
|
||||||
|
llama_free(ctx);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
ctx->backends.push_back(backend);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(GGML_USE_METAL)
|
#if defined(GGML_USE_METAL)
|
||||||
if (model->n_gpu_layers > 0) {
|
if (model->n_gpu_layers > 0) {
|
||||||
ctx->backend_metal = ggml_backend_metal_init();
|
ctx->backend_metal = ggml_backend_metal_init();
|
||||||
@ -18310,19 +18332,6 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(GGML_USE_RPC)
|
|
||||||
if (model->n_gpu_layers > 0) {
|
|
||||||
for (const auto & endpoint : model->rpc_servers) {
|
|
||||||
ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
|
|
||||||
if (backend == nullptr) {
|
|
||||||
LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
|
|
||||||
llama_free(ctx);
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
ctx->backends.push_back(backend);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
ctx->backend_cpu = ggml_backend_cpu_init();
|
ctx->backend_cpu = ggml_backend_cpu_init();
|
||||||
if (ctx->backend_cpu == nullptr) {
|
if (ctx->backend_cpu == nullptr) {
|
||||||
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
|
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
|
||||||
|
Loading…
Reference in New Issue
Block a user