From 1bf38cffdf932e23efb169e488def7c06bdb35d7 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
Date: Tue, 10 Dec 2024 17:18:16 +0100
Subject: [PATCH] server/bench: - support OpenAI streaming standard output with
 [DONE]\n\n - export k6 raw results in CSV - fix too many idle TCP connections
 in tcp_wait - add metric for time to emit first token

---
 examples/server/bench/README.md |  6 +++---
 examples/server/bench/bench.py  |  3 +++
 examples/server/bench/script.js | 18 +++++++++++++++---
 3 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md
index 353368e13..9549795ec 100644
--- a/examples/server/bench/README.md
+++ b/examples/server/bench/README.md
@@ -6,10 +6,10 @@ Benchmark is using [k6](https://k6.io/).
 
 SSE is not supported by default in k6, you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension.
 
-Example:
+Example (assuming Go >= 1.21 is installed):
 ```shell
 go install go.k6.io/xk6/cmd/xk6@latest
-xk6 build master \
+$GOPATH/bin/xk6 build master \
 --with github.com/phymbert/xk6-sse
 ```
 
@@ -33,7 +33,7 @@ The server must answer OAI Chat completion requests on `http://localhost:8080/v1
 
 Example:
 ```shell
-server --host localhost --port 8080 \
+llama-server --host localhost --port 8080 \
   --model ggml-model-q4_0.gguf \
   --cont-batching \
   --metrics \
diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py
index a9ed747f5..bfad8d115 100644
--- a/examples/server/bench/bench.py
+++ b/examples/server/bench/bench.py
@@ -214,11 +214,14 @@ def start_benchmark(args):
     k6_args = [
         'run', args.scenario,
         '--no-color',
+        '--no-connection-reuse',
+        '--no-vu-connection-reuse',
     ]
     k6_args.extend(['--duration', args.duration])
     k6_args.extend(['--iterations', args.n_prompts])
     k6_args.extend(['--vus', args.parallel])
     k6_args.extend(['--summary-export', 'k6-results.json'])
+    k6_args.extend(['--out', 'csv=k6-results.csv'])
     args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} "
     args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]])
     print(f"bench: starting k6 with: {args}")
diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js
index bdf4f5abc..2772bee5e 100644
--- a/examples/server/bench/script.js
+++ b/examples/server/bench/script.js
@@ -56,6 +56,7 @@ const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')
 
 const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
 const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
+const llamacpp_emit_first_token_second = new Trend('llamacpp_emit_first_token_second')
 
 const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
 const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
@@ -89,6 +90,9 @@ export default function () {
         ],
         "model": model,
         "stream": true,
+        "stream_options": {
+          "include_usage": true, // False to be supported in llama.cpp server
+        },
         "seed": 42,
         "max_tokens": max_tokens,
         "stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed) since the server expects that the model always to emit BOS
@@ -105,12 +109,20 @@ export default function () {
         client.on('event', function (event) {
             if (promptEvalEndTime == null) {
                 promptEvalEndTime = new Date()
+                llamacpp_emit_first_token_second.add((promptEvalEndTime - startTime) / 1.e3)
+            }
+
+            if (event.data === '[DONE]' || event.data === '') {
+                return
             }
 
             let chunk = JSON.parse(event.data)
-            let choice = chunk.choices[0]
-            if (choice.finish_reason) {
-                finish_reason = choice.finish_reason
+
+            if (chunk.choices && chunk.choices.length > 0) {
+                let choice = chunk.choices[0]
+                if (choice.finish_reason) {
+                    finish_reason = choice.finish_reason
+                }
             }
 
             if (chunk.usage) {