mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-11 19:21:46 +00:00
batched-bench : add --output-format jsonl
option (#9293)
`--output-format` is modeled after `llama-bench`'s options
This commit is contained in:
parent
409dc4f8bb
commit
815b1fb20a
@ -1678,6 +1678,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
|||||||
else { invalid_param = true; }
|
else { invalid_param = true; }
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
if (arg == "--output-format") {
|
||||||
|
CHECK_ARG
|
||||||
|
std::string value(argv[i]);
|
||||||
|
/**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
|
||||||
|
else if (value == "md") { params.batched_bench_output_jsonl = false; }
|
||||||
|
else { invalid_param = true; }
|
||||||
|
return true;
|
||||||
|
}
|
||||||
if (arg == "--no-warmup") {
|
if (arg == "--no-warmup") {
|
||||||
params.warmup = false;
|
params.warmup = false;
|
||||||
return true;
|
return true;
|
||||||
@ -2068,6 +2076,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
|||||||
options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
|
options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
|
||||||
options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
|
options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
|
||||||
|
|
||||||
|
options.push_back({ "batched-bench" });
|
||||||
|
options.push_back({ "batched-bench", " --output-format {md,jsonl}", "output format for batched-bench results (default: md)" });
|
||||||
|
|
||||||
printf("usage: %s [options]\n", argv[0]);
|
printf("usage: %s [options]\n", argv[0]);
|
||||||
|
|
||||||
for (const auto & o : options) {
|
for (const auto & o : options) {
|
||||||
|
@ -275,6 +275,9 @@ struct gpt_params {
|
|||||||
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
||||||
|
|
||||||
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
|
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
|
||||||
|
|
||||||
|
// batched-bench params
|
||||||
|
bool batched_bench_output_jsonl = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
void gpt_params_parse_from_env(gpt_params & params);
|
void gpt_params_parse_from_env(gpt_params & params);
|
||||||
|
@ -49,3 +49,12 @@ There are 2 modes of operation:
|
|||||||
| 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 |
|
| 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 |
|
||||||
| 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 |
|
| 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 |
|
||||||
| 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 |
|
| 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 |
|
||||||
|
|
||||||
|
### JSONL output
|
||||||
|
|
||||||
|
Pass `--output-format jsonl` to output JSONL instead of Markdown, á la
|
||||||
|
|
||||||
|
```json lines
|
||||||
|
{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 1, "n_kv": 256, "t_pp": 0.233810, "speed_pp": 547.453064, "t_tg": 3.503684, "speed_tg": 36.532974, "t": 3.737494, "speed": 68.495094}
|
||||||
|
{"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 2, "n_kv": 512, "t_pp": 0.422602, "speed_pp": 605.770935, "t_tg": 11.106112, "speed_tg": 23.050371, "t": 11.528713, "speed": 44.410854}
|
||||||
|
```
|
||||||
|
@ -122,12 +122,13 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (!params.batched_bench_output_jsonl) {
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
|
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
|
|
||||||
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
||||||
LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
|
LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
|
||||||
|
}
|
||||||
|
|
||||||
for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
|
for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
|
||||||
for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
|
for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
|
||||||
@ -195,10 +196,19 @@ int main(int argc, char ** argv) {
|
|||||||
const float speed_tg = pl*tg / t_tg;
|
const float speed_tg = pl*tg / t_tg;
|
||||||
const float speed = n_kv / t;
|
const float speed = n_kv / t;
|
||||||
|
|
||||||
|
if(params.batched_bench_output_jsonl) {
|
||||||
|
LOG_TEE(
|
||||||
|
"{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
|
||||||
|
"\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
|
||||||
|
n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
|
||||||
|
pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
|
||||||
|
);
|
||||||
|
} else {
|
||||||
LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
|
LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
llama_print_timings(ctx);
|
llama_print_timings(ctx);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user