add --no-mmap in llama-bench (#5257)

* add --no-mmap, show SYCL backend

* fix conflict

* fix code format, change the print for --no-mmap

* rename no_mmap to mmap; show mmap in the printer only when it is not the default value

* update the guide for mmap

* move the mmap loop position to reduce model reloads
Neo Zhang Jianyu 2024-02-02 03:48:53 +08:00 committed by GitHub
parent 4d0924a890
commit 128dcbd3c9
4 changed files with 89 additions and 10 deletions

README-sycl.md

@@ -405,7 +405,7 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 llama.cpp uses mmap as the default way to read the model file and copy it to the GPU. On some systems, memcpy is abnormal and blocks.
-Solution: add **--no-mmap**.
+Solution: add **--no-mmap** or **--mmap 0**.
 ## Q&A
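For example (binary and model paths are placeholders, not taken from the commit):

    ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Hello" --no-mmap
    ./build/bin/llama-bench -m models/llama-2-7b.Q4_0.gguf -mmp 0

The first form disables mmap in the regular tools; the second uses the new llama-bench flag added by this commit.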

examples/llama-bench/llama-bench.cpp

@@ -20,6 +20,7 @@
 #include "llama.h"
 #include "common.h"
 #include "ggml-cuda.h"
+#include "ggml-sycl.h"
 // utils
 static uint64_t get_time_ns() {
@@ -120,6 +121,22 @@ static std::string get_gpu_info() {
             id += "/";
         }
     }
+#endif
+#ifdef GGML_USE_SYCL
+    int device_list[GGML_SYCL_MAX_DEVICES];
+    ggml_sycl_get_gpu_list(device_list, GGML_SYCL_MAX_DEVICES);
+    for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
+        if (device_list[i] > 0) {
+            char buf[128];
+            ggml_sycl_get_device_description(i, buf, sizeof(buf));
+            id += buf;
+            id += "/";
+        }
+    }
+    if (id.length() > 2) {
+        id.pop_back();
+    }
 #endif
     // TODO: other backends
     return id;
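With the added block, a machine with a single supported SYCL GPU would yield a gpu_info string like the following (illustrative only; device name borrowed from the guide above), since the pop_back() call strips the trailing "/":

    Intel(R) Arc(TM) A770 Graphics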
@@ -161,6 +178,7 @@ struct cmd_params {
     std::vector<bool> no_kv_offload;
     std::vector<bool> mul_mat_q;
     std::vector<std::vector<float>> tensor_split;
+    std::vector<bool> use_mmap;
     int reps;
     bool verbose;
     output_formats output_format;
@@ -180,6 +198,7 @@ static const cmd_params cmd_params_defaults = {
     /* no_kv_offload */ {false},
     /* mul_mat_q     */ {true},
     /* tensor_split  */ {std::vector<float>(llama_max_devices(), 0.0f)},
+    /* use_mmap      */ {true},
     /* reps          */ 5,
     /* verbose       */ false,
     /* output_format */ MARKDOWN
@@ -201,6 +220,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
     printf("  -mg, --main-gpu <i>                (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
     printf("  -nkvo, --no-kv-offload <0|1>       (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
+    printf("  -mmp, --mmap <0|1>                 (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
     printf("  -mmq, --mul-mat-q <0|1>            (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
     printf("  -ts, --tensor_split <ts0/ts1/..>   (default: 0)\n");
     printf("  -r, --repetitions <n>              (default: %d)\n", cmd_params_defaults.reps);
@@ -370,6 +390,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
         }
         auto p = split<bool>(argv[i], split_delim);
         params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
+    } else if (arg == "-mmp" || arg == "--mmap") {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        }
+        auto p = split<bool>(argv[i], split_delim);
+        params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
     } else if (arg == "-ts" || arg == "--tensor-split") {
         if (++i >= argc) {
             invalid_param = true;
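Because the argument is run through split&lt;bool&gt;(argv[i], split_delim), a comma-separated list expands into one benchmark configuration per value; a hypothetical invocation (placeholder model path) that measures both settings in one run:

    ./build/bin/llama-bench -m models/llama-2-7b.Q4_0.gguf -mmp 0,1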
@@ -441,6 +468,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.no_kv_offload.empty()) { params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
     if (params.mul_mat_q.empty())     { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
     if (params.tensor_split.empty())  { params.tensor_split = cmd_params_defaults.tensor_split; }
+    if (params.use_mmap.empty())      { params.use_mmap = cmd_params_defaults.use_mmap; }
     if (params.n_threads.empty())     { params.n_threads = cmd_params_defaults.n_threads; }
     return params;
@@ -460,6 +488,7 @@ struct cmd_params_instance {
     bool no_kv_offload;
     bool mul_mat_q;
     std::vector<float> tensor_split;
+    bool use_mmap;
     llama_model_params to_llama_mparams() const {
         llama_model_params mparams = llama_model_default_params();
@@ -468,6 +497,7 @@ struct cmd_params_instance {
         mparams.split_mode = split_mode;
         mparams.main_gpu = main_gpu;
         mparams.tensor_split = tensor_split.data();
+        mparams.use_mmap = use_mmap;
         return mparams;
     }
@@ -477,6 +507,7 @@ struct cmd_params_instance {
         n_gpu_layers == other.n_gpu_layers &&
         split_mode == other.split_mode &&
         main_gpu == other.main_gpu &&
+        use_mmap == other.use_mmap &&
         tensor_split == other.tensor_split;
     }
@@ -503,6 +534,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & sm : params.split_mode)
     for (const auto & mg : params.main_gpu)
     for (const auto & ts : params.tensor_split)
+    for (const auto & mmp : params.use_mmap)
     for (const auto & nb : params.n_batch)
     for (const auto & tk : params.type_k)
     for (const auto & tv : params.type_v)
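Per the commit message ("move the mmap loop position to reduce model reloads"), the use_mmap loop sits with the other model-level parameters rather than inside the per-test loops, so consecutive instances keep the same model parameters for as long as possible and the model is reloaded only when equal_mparams() reports a difference. A minimal sketch of that reuse pattern (not the verbatim llama-bench main loop; llama_load_model_from_file/llama_free_model are the model API of that era):

    llama_model * lmodel = nullptr;
    const cmd_params_instance * prev_inst = nullptr;
    for (const auto & inst : params_instances) {
        // reload only when the model parameters (including use_mmap) changed
        if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
            if (lmodel) {
                llama_free_model(lmodel);
            }
            lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams());
            prev_inst = &inst;
        }
        // ... run the prompt/generation tests against lmodel ...
    }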
@@ -527,6 +559,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
         /* .no_kv_offload= */ nkvo,
         /* .mul_mat_q    = */ mmq,
         /* .tensor_split = */ ts,
+        /* .use_mmap     = */ mmp,
         };
         instances.push_back(instance);
     }
@@ -549,6 +582,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
         /* .no_kv_offload= */ nkvo,
         /* .mul_mat_q    = */ mmq,
         /* .tensor_split = */ ts,
+        /* .use_mmap     = */ mmp,
         };
         instances.push_back(instance);
     }
@@ -565,6 +599,7 @@ struct test {
     static const bool vulkan;
     static const bool kompute;
     static const bool metal;
+    static const bool sycl;
     static const bool gpu_blas;
     static const bool blas;
     static const std::string cpu_info;
@@ -583,6 +618,7 @@ struct test {
     bool no_kv_offload;
     bool mul_mat_q;
     std::vector<float> tensor_split;
+    bool use_mmap;
     int n_prompt;
     int n_gen;
     std::string test_time;
@@ -605,6 +641,7 @@ struct test {
         no_kv_offload = inst.no_kv_offload;
         mul_mat_q = inst.mul_mat_q;
         tensor_split = inst.tensor_split;
+        use_mmap = inst.use_mmap;
         n_prompt = inst.n_prompt;
         n_gen = inst.n_gen;
         // RFC 3339 date-time format
@@ -654,25 +691,29 @@ struct test {
         if (metal) {
             return "Metal";
         }
+        if (sycl) {
+            return GGML_SYCL_NAME;
+        }
         if (gpu_blas) {
             return "GPU BLAS";
         }
         if (blas) {
             return "BLAS";
         }
         return "CPU";
     }
     static const std::vector<std::string> & get_fields() {
         static const std::vector<std::string> fields = {
             "build_commit", "build_number",
-            "cuda", "opencl", "vulkan", "kompute", "metal", "gpu_blas", "blas",
+            "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
             "cpu_info", "gpu_info",
             "model_filename", "model_type", "model_size", "model_n_params",
             "n_batch", "n_threads", "type_k", "type_v",
             "n_gpu_layers", "split_mode",
             "main_gpu", "no_kv_offload",
-            "mul_mat_q", "tensor_split",
+            "mul_mat_q", "tensor_split", "use_mmap",
             "n_prompt", "n_gen", "test_time",
             "avg_ns", "stddev_ns",
             "avg_ts", "stddev_ts"
@@ -691,8 +732,8 @@ struct test {
             return INT;
         }
         if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
-            field == "gpu_blas" || field == "blas" || field == "f16_kv" || field == "no_kv_offload" ||
-            field == "mul_mat_q") {
+            field == "gpu_blas" || field == "blas" || field == "sycl" || field == "f16_kv" || field == "no_kv_offload" ||
+            field == "mul_mat_q" || field == "use_mmap") {
             return BOOL;
         }
         if (field == "avg_ts" || field == "stddev_ts") {
@@ -720,13 +761,13 @@ struct test {
         std::vector<std::string> values = {
             build_commit, std::to_string(build_number),
             std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(kompute),
-            std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
+            std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
             std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
             std::to_string(n_gpu_layers), split_mode_str(split_mode),
             std::to_string(main_gpu), std::to_string(no_kv_offload),
-            std::to_string(mul_mat_q), tensor_split_str,
+            std::to_string(mul_mat_q), tensor_split_str, std::to_string(use_mmap),
             std::to_string(n_prompt), std::to_string(n_gen), test_time,
             std::to_string(avg_ns()), std::to_string(stdev_ns()),
             std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -753,6 +794,7 @@ const bool test::kompute = !!ggml_cpu_has_kompute();
 const bool test::metal    = !!ggml_cpu_has_metal();
 const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
 const bool test::blas     = !!ggml_cpu_has_blas();
+const bool test::sycl     = !!ggml_cpu_has_sycl();
 const std::string test::cpu_info = get_cpu_info();
 const std::string test::gpu_info = get_gpu_info();
@@ -895,6 +937,9 @@ struct markdown_printer : public printer {
         if (field == "no_kv_offload") {
             return "nkvo";
         }
+        if (field == "use_mmap") {
+            return "mmap";
+        }
         if (field == "tensor_split") {
             return "ts";
         }
@@ -938,6 +983,9 @@ struct markdown_printer : public printer {
         if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
             fields.push_back("tensor_split");
         }
+        if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
+            fields.push_back("use_mmap");
+        }
         fields.push_back("test");
         fields.push_back("t/s");

ggml-sycl.cpp

@@ -2928,7 +2928,6 @@ void ggml_sycl_set_main_device(int main_device);
 void ggml_sycl_set_mul_mat_q(bool mul_mat_q);
 void ggml_sycl_set_scratch_size(size_t scratch_size);
 void ggml_sycl_free_scratch(void);
-int ggml_sycl_get_device_count(void);
 void ggml_sycl_get_device_description(int device, char * description, size_t description_size);
 bool ggml_backend_is_sycl(ggml_backend_t backend);
 int ggml_backend_sycl_get_device(ggml_backend_t backend);
@@ -14493,6 +14492,37 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
     return true;
 }
+GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int * id_list, int max_len) try {
+    int max_compute_units = -1;
+    for (int i = 0; i < max_len; i++) {
+        id_list[i] = 0;
+    }
+    int device_count = dpct::dev_mgr::instance().device_count();
+    // first pass: find the largest max_compute_units among all GPU devices
+    for (int id = 0; id < device_count; id++) {
+        sycl::device device = dpct::dev_mgr::instance().get_device(id);
+        if (!device.is_gpu()) {
+            continue;
+        }
+        dpct::device_info prop;
+        dpct::get_device_info(prop, device);
+        if (max_compute_units < prop.get_max_compute_units()) {
+            max_compute_units = prop.get_max_compute_units();
+        }
+    }
+    // second pass: flag the GPUs that match that maximum and report major version 1
+    for (int id = 0; id < device_count; id++) {
+        sycl::device device = dpct::dev_mgr::instance().get_device(id);
+        if (!device.is_gpu()) {
+            continue;
+        }
+        dpct::device_info prop;
+        dpct::get_device_info(prop, device);
+        if (max_compute_units == prop.get_max_compute_units() && prop.get_major_version() == 1) {
+            id_list[id] = 1;
+        }
+    }
+}
+catch (sycl::exception const &exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+              << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
 int ggml_sycl_get_device_count() try {
     int device_count;
     if (CHECK_TRY_ERROR(device_count =
@@ -14507,7 +14537,7 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }
-void ggml_sycl_get_device_description(int device, char *description,
+GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
                                       size_t description_size) try {
     dpct::device_info prop;
     SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(

ggml-sycl.h

@@ -22,7 +22,8 @@ GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
 GGML_API void ggml_backend_sycl_print_sycl_devices(void);
+GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
+GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
 #ifdef __cplusplus
 }
 #endif
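A minimal sketch of a standalone consumer of the two newly exported symbols, mirroring what llama-bench's get_gpu_info() does (assumes a SYCL-enabled build; GGML_SYCL_MAX_DEVICES is the macro the llama-bench code above also relies on):

    #include <cstdio>
    #include "ggml-sycl.h"

    int main() {
        int device_list[GGML_SYCL_MAX_DEVICES];
        // device_list[i] becomes 1 for each SYCL GPU the backend considers usable
        ggml_sycl_get_gpu_list(device_list, GGML_SYCL_MAX_DEVICES);
        for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
            if (device_list[i] > 0) {
                char buf[128];
                ggml_sycl_get_device_description(i, buf, sizeof(buf));
                printf("SYCL GPU %d: %s\n", i, buf);
            }
        }
        return 0;
    }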