Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-12-25 02:44:36 +00:00)
server/bench:
- support OpenAI streaming standard output with [DONE]\n\n
- export k6 raw results in CSV
- fix too many TCP idle connections in tcp_wait
- add metric: time to emit first token
parent 26a8406ba9
commit 1bf38cffdf
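To make the first and last commit bullets concrete, here is a minimal client sketch: it streams an OAI chat completion from a llama.cpp server, stops at the `[DONE]` sentinel, and times the first emitted token. This is an illustration only, not code from the commit; the endpoint URL, payload values, and the use of Python's `requests` package are assumptions.

```python
# Hypothetical client sketch (not part of the commit): stream an OAI chat completion,
# stop on the '[DONE]' sentinel and measure the time to the first emitted token.
# Assumes a llama.cpp server on http://localhost:8080 and the 'requests' package.
import json
import time

import requests

payload = {
    "model": "model",  # placeholder model name
    "messages": [{"role": "user", "content": "Say hello."}],
    "stream": True,
    "stream_options": {"include_usage": True},  # ask for a final usage chunk, as in the benchmark payload below
    "max_tokens": 64,
}

start = time.time()
first_token_s = None
with requests.post("http://localhost:8080/v1/chat/completions", json=payload, stream=True) as r:
    for line in r.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue  # skip blank separator lines and non-data SSE fields
        data = line[len("data: "):]
        if data == "[DONE]":  # OpenAI-style end-of-stream sentinel
            break
        chunk = json.loads(data)
        if first_token_s is None and chunk.get("choices"):
            first_token_s = time.time() - start  # time to emit first token
        if chunk.get("usage"):
            print("usage:", chunk["usage"])

print(f"time to first token: {first_token_s:.3f}s" if first_token_s else "no tokens received")
```

The k6 script changed in the diff below does the same bookkeeping inside its SSE event handler.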
@@ -6,10 +6,10 @@ Benchmark is using [k6](https://k6.io/).
 
 SSE is not supported by default in k6, you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension.
 
-Example:
+Example (assuming golang >= 1.21 is installed):
 ```shell
 go install go.k6.io/xk6/cmd/xk6@latest
-xk6 build master \
+$GOPATH/bin/xk6 build master \
 --with github.com/phymbert/xk6-sse
 ```
 
@@ -33,7 +33,7 @@ The server must answer OAI Chat completion requests on `http://localhost:8080/v1
 
 Example:
 ```shell
-server --host localhost --port 8080 \
+llama-server --host localhost --port 8080 \
   --model ggml-model-q4_0.gguf \
   --cont-batching \
   --metrics \
@@ -214,11 +214,14 @@ def start_benchmark(args):
     k6_args = [
         'run', args.scenario,
         '--no-color',
+        '--no-connection-reuse',
+        '--no-vu-connection-reuse',
     ]
     k6_args.extend(['--duration', args.duration])
     k6_args.extend(['--iterations', args.n_prompts])
     k6_args.extend(['--vus', args.parallel])
     k6_args.extend(['--summary-export', 'k6-results.json'])
+    k6_args.extend(['--out', 'csv=k6-results.csv'])
     args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} "
     args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]])
     print(f"bench: starting k6 with: {args}")
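Besides the existing `--summary-export k6-results.json`, raw per-sample data now goes to `k6-results.csv` via `--out csv=...`. A rough post-processing sketch, assuming k6's default CSV column names (`metric_name`, `metric_value`); this commit does not define them, so check against your k6 build:

```python
# Sketch (assumption, not part of the commit): aggregate the raw k6 samples exported
# with '--out csv=k6-results.csv', grouping values by metric name.
import csv
from collections import defaultdict

samples = defaultdict(list)
with open("k6-results.csv", newline="") as f:
    for row in csv.DictReader(f):
        samples[row["metric_name"]].append(float(row["metric_value"]))

for name in ("llamacpp_tokens_second", "llamacpp_emit_first_token_second"):
    values = samples.get(name, [])
    if values:
        print(f"{name}: n={len(values)} avg={sum(values) / len(values):.3f}")
```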
@@ -56,6 +56,7 @@ const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')
 
 const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
 const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
+const llamacpp_emit_first_token_second = new Trend('llamacpp_emit_first_token_second')
 
 const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
 const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
@@ -89,6 +90,9 @@ export default function () {
         ],
         "model": model,
         "stream": true,
+        "stream_options": {
+            "include_usage": true, // False to be supported in llama.cpp server
+        },
         "seed": 42,
         "max_tokens": max_tokens,
         "stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed) since the server expects that the model always to emit BOS
@@ -105,12 +109,20 @@ export default function () {
         client.on('event', function (event) {
             if (promptEvalEndTime == null) {
                 promptEvalEndTime = new Date()
+                llamacpp_emit_first_token_second.add((promptEvalEndTime - startTime) / 1.e3)
             }
 
+            if (event.data === '[DONE]' || event.data === '') {
+                return
+            }
+
             let chunk = JSON.parse(event.data)
-            let choice = chunk.choices[0]
-            if (choice.finish_reason) {
-                finish_reason = choice.finish_reason
+
+            if (chunk.choices && chunk.choices.length > 0) {
+                let choice = chunk.choices[0]
+                if (choice.finish_reason) {
+                    finish_reason = choice.finish_reason
+                }
             }
 
             if (chunk.usage) {
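The added guards in the handler above reflect the three payload kinds an OpenAI-style stream can deliver: ordinary delta chunks, a usage-only final chunk whose `choices` array is empty (when `include_usage` is set), and the bare `[DONE]` sentinel. Below is a small Python mirror of that guard logic, with illustrative payloads rather than captured server output:

```python
# Illustrative Python mirror (not part of the commit) of the updated k6 event handler's guards.
import json

def handle(data: str, state: dict) -> None:
    """Apply the same checks as the updated k6 event handler."""
    if data == "[DONE]" or data == "":
        return  # end-of-stream sentinel or empty event: nothing to parse
    chunk = json.loads(data)
    if chunk.get("choices"):  # the usage-only final chunk has an empty choices list
        choice = chunk["choices"][0]
        if choice.get("finish_reason"):
            state["finish_reason"] = choice["finish_reason"]
    if chunk.get("usage"):
        state["completion_tokens"] = chunk["usage"].get("completion_tokens")

state = {}
for data in (
    '{"choices":[{"delta":{"content":"Hi"}}]}',           # ordinary delta chunk (illustrative)
    '{"choices":[{"delta":{},"finish_reason":"stop"}]}',  # last content chunk (illustrative)
    '{"choices":[],"usage":{"completion_tokens":2}}',     # usage-only chunk (illustrative)
    "[DONE]",
):
    handle(data, state)
print(state)  # {'finish_reason': 'stop', 'completion_tokens': 2}
```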