llama-bench : add JSONL (NDJSON) output mode (#9288)

* llama-bench : add JSONL (NDJSON) output mode

* llama-bench : update usage docs
This commit is contained in:
Aarni Koskela 2024-09-03 20:58:54 +03:00 committed by GitHub
parent b69a480af4
commit 8962422b1c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 127 additions and 81 deletions

View File

@ -14,7 +14,8 @@ Performance testing tool for llama.cpp.
1. [Markdown](#markdown) 1. [Markdown](#markdown)
2. [CSV](#csv) 2. [CSV](#csv)
3. [JSON](#json) 3. [JSON](#json)
4. [SQL](#sql) 4. [JSONL](#jsonl)
5. [SQL](#sql)
## Syntax ## Syntax
@ -23,27 +24,34 @@ usage: ./llama-bench [options]
options: options:
-h, --help -h, --help
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf) -m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
-p, --n-prompt <n> (default: 512) -p, --n-prompt <n> (default: 512)
-n, --n-gen <n> (default: 128) -n, --n-gen <n> (default: 128)
-pg <pp,tg> (default: 512,128) -pg <pp,tg> (default: )
-b, --batch-size <n> (default: 2048) -b, --batch-size <n> (default: 2048)
-ub, --ubatch-size <n> (default: 512) -ub, --ubatch-size <n> (default: 512)
-ctk, --cache-type-k <t> (default: f16) -ctk, --cache-type-k <t> (default: f16)
-ctv, --cache-type-v <t> (default: f16) -ctv, --cache-type-v <t> (default: f16)
-t, --threads <n> (default: 16) -t, --threads <n> (default: 8)
-ngl, --n-gpu-layers <n> (default: 99) -C, --cpu-mask <hex,hex> (default: 0x0)
-sm, --split-mode <none|layer|row> (default: layer) --cpu-strict <0|1> (default: 0)
-mg, --main-gpu <i> (default: 0) --poll <0...100> (default: 50)
-nkvo, --no-kv-offload <0|1> (default: 0) -ngl, --n-gpu-layers <n> (default: 99)
-fa, --flash-attn <0|1> (default: 0) -rpc, --rpc <rpc_servers> (default: )
-mmp, --mmap <0|1> (default: 1) -sm, --split-mode <none|layer|row> (default: layer)
--numa <distribute|isolate|numactl> (default: disabled) -mg, --main-gpu <i> (default: 0)
-embd, --embeddings <0|1> (default: 0) -nkvo, --no-kv-offload <0|1> (default: 0)
-ts, --tensor-split <ts0/ts1/..> (default: 0) -fa, --flash-attn <0|1> (default: 0)
-r, --repetitions <n> (default: 5) -mmp, --mmap <0|1> (default: 1)
-o, --output <csv|json|md|sql> (default: md) --numa <distribute|isolate|numactl> (default: disabled)
-v, --verbose (default: 0) -embd, --embeddings <0|1> (default: 0)
-ts, --tensor-split <ts0/ts1/..> (default: 0)
-r, --repetitions <n> (default: 5)
--prio <0|1|2|3> (default: 0)
--delay <0...N> (seconds) (default: 0)
-o, --output <csv|json|jsonl|md|sql> (default: md)
-oe, --output-err <csv|json|jsonl|md|sql> (default: none)
-v, --verbose (default: 0)
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times. Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
``` ```
@ -238,6 +246,19 @@ $ ./llama-bench -o json
] ]
``` ```
### JSONL
```sh
$ ./llama-bench -o jsonl
```
```json lines
{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]}
{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]}
```
### SQL ### SQL
SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database. SQL output is suitable for importing into a SQLite database. The output can be piped into the `sqlite3` command line tool to add the results to a database.

View File

@ -171,13 +171,14 @@ static std::string get_gpu_info() {
} }
// command line params // command line params
enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL}; enum output_formats {NONE, CSV, JSON, JSONL, MARKDOWN, SQL};
static const char * output_format_str(output_formats format) { static const char * output_format_str(output_formats format) {
switch (format) { switch (format) {
case NONE: return "none"; case NONE: return "none";
case CSV: return "csv"; case CSV: return "csv";
case JSON: return "json"; case JSON: return "json";
case JSONL: return "jsonl";
case MARKDOWN: return "md"; case MARKDOWN: return "md";
case SQL: return "sql"; case SQL: return "sql";
default: GGML_ABORT("invalid output format"); default: GGML_ABORT("invalid output format");
@ -191,6 +192,8 @@ static bool output_format_from_str(const std::string & s, output_formats & forma
format = CSV; format = CSV;
} else if (s == "json") { } else if (s == "json") {
format = JSON; format = JSON;
} else if (s == "jsonl") {
format = JSONL;
} else if (s == "md") { } else if (s == "md") {
format = MARKDOWN; format = MARKDOWN;
} else if (s == "sql") { } else if (s == "sql") {
@ -283,34 +286,34 @@ static void print_usage(int /* argc */, char ** argv) {
printf("\n"); printf("\n");
printf("options:\n"); printf("options:\n");
printf(" -h, --help\n"); printf(" -h, --help\n");
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str()); printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str()); printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str()); printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str()); printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n"); printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str()); printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n"); printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps); printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio); printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay); printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); printf(" -o, --output <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr)); printf(" -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
printf("\n"); printf("\n");
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
} }
@ -1074,38 +1077,39 @@ struct csv_printer : public printer {
} }
}; };
static std::string escape_json(const std::string & value) {
std::string escaped;
for (auto c : value) {
if (c == '"') {
escaped += "\\\"";
} else if (c == '\\') {
escaped += "\\\\";
} else if (c <= 0x1f) {
char buf[8];
snprintf(buf, sizeof(buf), "\\u%04x", c);
escaped += buf;
} else {
escaped += c;
}
}
return escaped;
}
static std::string format_json_value(const std::string & field, const std::string & value) {
switch (test::get_field_type(field)) {
case test::STRING:
return "\"" + escape_json(value) + "\"";
case test::BOOL:
return value == "0" ? "false" : "true";
default:
return value;
}
}
struct json_printer : public printer { struct json_printer : public printer {
bool first = true; bool first = true;
static std::string escape_json(const std::string & value) {
std::string escaped;
for (auto c : value) {
if (c == '"') {
escaped += "\\\"";
} else if (c == '\\') {
escaped += "\\\\";
} else if (c <= 0x1f) {
char buf[8];
snprintf(buf, sizeof(buf), "\\u%04x", c);
escaped += buf;
} else {
escaped += c;
}
}
return escaped;
}
static std::string format_value(const std::string & field, const std::string & value) {
switch (test::get_field_type(field)) {
case test::STRING:
return "\"" + escape_json(value) + "\"";
case test::BOOL:
return value == "0" ? "false" : "true";
default:
return value;
}
}
void print_header(const cmd_params & params) override { void print_header(const cmd_params & params) override {
fprintf(fout, "[\n"); fprintf(fout, "[\n");
(void) params; (void) params;
@ -1114,7 +1118,7 @@ struct json_printer : public printer {
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) { void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
assert(fields.size() == values.size()); assert(fields.size() == values.size());
for (size_t i = 0; i < fields.size(); i++) { for (size_t i = 0; i < fields.size(); i++) {
fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_value(fields.at(i), values.at(i)).c_str()); fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
} }
} }
@ -1137,6 +1141,25 @@ struct json_printer : public printer {
} }
}; };
struct jsonl_printer : public printer {
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
assert(fields.size() == values.size());
for (size_t i = 0; i < fields.size(); i++) {
fprintf(fout, "\"%s\": %s, ", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
}
}
void print_test(const test & t) override {
fprintf(fout, "{");
print_fields(test::get_fields(), t.get_values());
fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str());
fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str());
fprintf(fout, "}\n");
fflush(fout);
}
};
struct markdown_printer : public printer { struct markdown_printer : public printer {
std::vector<std::string> fields; std::vector<std::string> fields;
@ -1437,6 +1460,8 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
return std::unique_ptr<printer>(new csv_printer()); return std::unique_ptr<printer>(new csv_printer());
case JSON: case JSON:
return std::unique_ptr<printer>(new json_printer()); return std::unique_ptr<printer>(new json_printer());
case JSONL:
return std::unique_ptr<printer>(new jsonl_printer());
case MARKDOWN: case MARKDOWN:
return std::unique_ptr<printer>(new markdown_printer()); return std::unique_ptr<printer>(new markdown_printer());
case SQL: case SQL: