server/bench:

- support OpenAI streaming standard output with `[DONE]\n\n`
- export k6 raw results in CSV
- fix too many idle TCP connections in tcp_wait
- add a metric for time to emit first token
Pierrick HYMBERT 2024-12-10 17:18:16 +01:00
parent 26a8406ba9
commit 1bf38cffdf
3 changed files with 21 additions and 6 deletions
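
For context on the first and last bullets above: an OpenAI-compatible server streams chat completions as server-sent events and terminates the stream with a literal `data: [DONE]\n\n` event, and "time to emit first token" is the delay between sending the request and receiving the first streamed chunk. The following is a minimal Python sketch, not part of this commit, that illustrates both; the endpoint URL, model name, and prompt are assumptions chosen to match the README example, and it requires the `requests` package.

```python
import json
import time

import requests  # assumed dependency; any HTTP client with response streaming works

# Illustrative values only: endpoint, model and prompt are assumptions, not part of the commit.
URL = "http://localhost:8080/v1/chat/completions"

payload = {
    "model": "ggml-model-q4_0.gguf",
    "messages": [{"role": "user", "content": "Say hello."}],
    "stream": True,
    "max_tokens": 32,
}

start = time.time()
first_token_s = None

with requests.post(URL, json=payload, stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue  # skip empty keep-alive lines between SSE events
        data = line[len("data: "):]
        if data == "[DONE]":  # OpenAI-style stream terminator
            break
        if first_token_s is None:
            first_token_s = time.time() - start  # time to emit first token
        chunk = json.loads(data)
        if chunk.get("choices"):  # the final usage-only chunk has no choices
            print(chunk["choices"][0].get("delta", {}).get("content", ""), end="", flush=True)

if first_token_s is not None:
    print(f"\ntime to first token: {first_token_s:.3f}s")
```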

@@ -6,10 +6,10 @@ Benchmark is using [k6](https://k6.io/).
 SSE is not supported by default in k6, you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension.
-Example:
+Example (assuming golang >= 1.21 is installed):
 ```shell
 go install go.k6.io/xk6/cmd/xk6@latest
-xk6 build master \
+$GOPATH/bin/xk6 build master \
 --with github.com/phymbert/xk6-sse
 ```
@@ -33,7 +33,7 @@ The server must answer OAI Chat completion requests on `http://localhost:8080/v1
 Example:
 ```shell
-server --host localhost --port 8080 \
+llama-server --host localhost --port 8080 \
 --model ggml-model-q4_0.gguf \
 --cont-batching \
 --metrics \

@@ -214,11 +214,14 @@ def start_benchmark(args):
     k6_args = [
         'run', args.scenario,
         '--no-color',
+        '--no-connection-reuse',
+        '--no-vu-connection-reuse',
     ]
     k6_args.extend(['--duration', args.duration])
     k6_args.extend(['--iterations', args.n_prompts])
     k6_args.extend(['--vus', args.parallel])
     k6_args.extend(['--summary-export', 'k6-results.json'])
+    k6_args.extend(['--out', 'csv=k6-results.csv'])
     args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} "
     args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]])
     print(f"bench: starting k6 with: {args}")

@@ -56,6 +56,7 @@ const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')
 const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
 const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
+const llamacpp_emit_first_token_second = new Trend('llamacpp_emit_first_token_second')
 const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
 const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
@@ -89,6 +90,9 @@ export default function () {
         ],
         "model": model,
         "stream": true,
+        "stream_options": {
+            "include_usage": true, // False to be supported in llama.cpp server
+        },
         "seed": 42,
         "max_tokens": max_tokens,
         "stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed) since the server expects that the model always to emit BOS
@@ -105,12 +109,20 @@ export default function () {
         client.on('event', function (event) {
             if (promptEvalEndTime == null) {
                 promptEvalEndTime = new Date()
+                llamacpp_emit_first_token_second.add((promptEvalEndTime - startTime) / 1.e3)
+            }
+
+            if (event.data === '[DONE]' || event.data === '') {
+                return
             }
 
             let chunk = JSON.parse(event.data)
-            let choice = chunk.choices[0]
-            if (choice.finish_reason) {
-                finish_reason = choice.finish_reason
+
+            if (chunk.choices && chunk.choices.length > 0) {
+                let choice = chunk.choices[0]
+                if (choice.finish_reason) {
+                    finish_reason = choice.finish_reason
+                }
             }
 
             if (chunk.usage) {