From 1b01f06db0cff5f5f600bb754fc39fde565ed56a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Sun, 19 May 2024 16:26:02 +0200
Subject: [PATCH] server: add test for token probs (#7347)

---
 examples/server/README.md                     |  2 +-
 .../server/tests/features/results.feature     | 52 ++++++++++++++++---
 examples/server/tests/features/steps/steps.py | 37 ++++++++++++-
 3 files changed, 81 insertions(+), 10 deletions(-)

diff --git a/examples/server/README.md b/examples/server/README.md
index 4f3262cdd..0c3db8c84 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -48,7 +48,7 @@ The project is under active development, and we are [looking for feedback and co
 - `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
 - `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
 - `--embeddings`: Enable embedding vector output and the OAI compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled
-- `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`
+- `-np N`, `--parallel N`: Set the number of slots for processing requests. Default: `1`. Values > 1 will allow for higher throughput with multiple parallel requests, but the results will **not** be deterministic due to differences in rounding error.
 - `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled
 - `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
 - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
diff --git a/examples/server/tests/features/results.feature b/examples/server/tests/features/results.feature
index aa0b8d0c6..5deb278c2 100644
--- a/examples/server/tests/features/results.feature
+++ b/examples/server/tests/features/results.feature
@@ -70,12 +70,48 @@ Feature: Results
     Then all predictions are equal
     Examples:
       | n_parallel | temp |
-      | 1 | 0.0 |
-      | 2 | 0.0 |
-      | 4 | 0.0 |
-      | 1 | 1.0 |
-    # FIXME: These tests fail on master. The problem seems to be the unified KV cache.
+      | 1          | 0.0  |
+      | 2          | 0.0  |
+      | 4          | 0.0  |
+      | 1          | 1.0  |
+    # FIXME: These tests fail on master.
+    # Problems: unified KV cache (except for CPU backend with LLAMA_NO_LLAMAFILE=1), SIMD nondeterminism.
     # See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
-    # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574 .
-    # | 2 | 1.0 |
-    # | 4 | 1.0 |
+    # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
+    # and https://github.com/ggerganov/llama.cpp/pull/7347 .
+    # | 2          | 1.0  |
+    # | 4          | 1.0  |
+
+  Scenario Outline: consistent token probs with same seed and prompt
+    Given <n_slots> slots
+    And <n_kv> KV cache size
+    And 1.0 temperature
+    And <n_predict> max tokens to predict
+    Then the server is starting
+    Then the server is healthy
+
+    Given 1 prompts "The meaning of life is" with seed 42
+    And concurrent completion requests
+    # Then the server is busy # Not all slots will be utilized.
+    Then the server is idle
+    And all slots are idle
+
+    Given <n_parallel> prompts "The meaning of life is" with seed 42
+    And concurrent completion requests
+    # Then the server is busy # Not all slots will be utilized.
+    Then the server is idle
+    And all slots are idle
+
+    Then all token probabilities are equal
+    Examples:
+      | n_slots | n_kv | n_predict | n_parallel |
+      | 4       | 1024 | 1         | 1          |
+      | 4       | 1024 | 1         | 4          |
+      # FIXME: These tests fail on master.
+      # Problems: unified KV cache (except for CPU backend with LLAMA_NO_LLAMAFILE=1), SIMD nondeterminism.
+      # See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
+      # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
+      # and https://github.com/ggerganov/llama.cpp/pull/7347 .
+      # | 4       | 1024 | 100       | 1          |
+      # This test still fails even with the above patches; the first token probabilities are already different.
+      # | 4       | 1024 | 100       | 4          |
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 577b87af3..7da503f2c 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -23,6 +23,7 @@ from prometheus_client import parser
 def step_server_config(context, server_fqdn, server_port):
     context.server_fqdn = server_fqdn
     context.server_port = int(server_port)
+    context.n_threads = None
     context.n_gpu_layer = None
     if 'PORT' in os.environ:
         context.server_port = int(os.environ['PORT'])
@@ -109,6 +110,11 @@ def step_n_gpu_layer(context, ngl):
     context.n_gpu_layer = ngl
 
 
+@step('{n_threads:d} threads')
+def step_n_threads(context, n_threads):
+    context.n_threads = n_threads
+
+
 @step('{draft:d} as draft')
 def step_draft(context, draft):
     context.draft = draft
@@ -274,13 +280,22 @@ async def step_predictions_equal(context):
 
 @step('all predictions are different')
 @async_run_until_complete
-async def step_predictions_equal(context):
+async def step_predictions_different(context):
     n_completions = await gather_tasks_results(context)
     assert n_completions >= 2, "need at least 2 completions"
     assert_all_predictions_different(context.tasks_result)
     context.tasks_result = []
 
 
+@step('all token probabilities are equal')
+@async_run_until_complete
+async def step_token_probabilities_equal(context):
+    n_completions = await gather_tasks_results(context)
+    assert n_completions >= 2, "need at least 2 completions"
+    assert_all_token_probabilities_equal(context.tasks_result)
+    context.tasks_result = []
+
+
 @step('the completion is truncated')
 def step_assert_completion_truncated(context):
     step_assert_completion_truncated(context, '')
@@ -869,6 +884,7 @@ async def request_completion(prompt,
                                     "id_slot": id_slot,
                                     "seed": seed if seed is not None else 42,
                                     "temperature": temperature if temperature is not None else "0.8f",
+                                    "n_probs": 2,
                                 },
                                 headers=headers,
                                 timeout=3600) as response:
@@ -1123,6 +1139,23 @@ def assert_all_predictions_different(completion_responses):
             assert content_i != content_j, "contents not different"
 
 
+def assert_all_token_probabilities_equal(completion_responses):
+    n_predict = len(completion_responses[0]['completion_probabilities'])
+    if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
+        for pos in range(n_predict):
+            for i, response_i in enumerate(completion_responses):
+                probs_i = response_i['completion_probabilities'][pos]['probs']
+                print(f"pos {pos}, probs {i}: {probs_i}")
+    for pos in range(n_predict):
+        for i, response_i in enumerate(completion_responses):
+            probs_i = response_i['completion_probabilities'][pos]['probs']
+            for j, response_j in enumerate(completion_responses):
+                if i == j:
+                    continue
+                probs_j = response_j['completion_probabilities'][pos]['probs']
+                assert probs_i == probs_j, "token probabilities not equal"
+
+
 async def gather_tasks_results(context):
     n_tasks = len(context.concurrent_tasks)
     if context.debug:
@@ -1261,6 +1294,8 @@ def start_server_background(context):
         server_args.extend(['--batch-size', context.n_batch])
     if context.n_ubatch:
         server_args.extend(['--ubatch-size', context.n_ubatch])
+    if context.n_threads:
+        server_args.extend(['--threads', context.n_threads])
     if context.n_gpu_layer:
         server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
     if context.draft is not None:
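
For reference, the determinism check that `assert_all_token_probabilities_equal` performs can also be reproduced against a running server without the behave harness. The sketch below is an illustration only, not part of the patch: it assumes a llama.cpp server is already listening on `http://localhost:8080` (for example, one started with `--parallel 4`) and that the Python `requests` package is installed; the helper name `get_token_probs` is made up for this example. The request fields (`prompt`, `seed`, `temperature`, `n_predict`, `n_probs`) and the response fields (`completion_probabilities`, `probs`) are the same ones the test code above relies on.

```python
# Minimal sketch: query a running llama.cpp server several times with the same
# prompt and seed, then compare the returned per-position token probabilities.
# The server address and the use of `requests` are assumptions, not part of the patch.
import requests

BASE_URL = "http://localhost:8080"  # assumed server address


def get_token_probs(prompt: str, seed: int, n_predict: int = 1) -> list:
    """Request a completion and return its per-position candidate probabilities."""
    payload = {
        "prompt": prompt,
        "seed": seed,
        "temperature": 1.0,
        "n_predict": n_predict,
        "n_probs": 2,  # same value the test sends
    }
    response = requests.post(f"{BASE_URL}/completion", json=payload, timeout=3600)
    response.raise_for_status()
    return response.json()["completion_probabilities"]


if __name__ == "__main__":
    runs = [get_token_probs("The meaning of life is", seed=42) for _ in range(4)]
    reference = runs[0]
    for run in runs[1:]:
        for pos, (ref_pos, run_pos) in enumerate(zip(reference, run)):
            assert ref_pos["probs"] == run_pos["probs"], \
                f"token probabilities differ at position {pos}"
    print("all runs returned identical token probabilities")
```

Note that this sketch sends its requests one after another, so it only exercises run-to-run determinism on a single slot; the parallel-slot nondeterminism described in the FIXME comments only shows up with concurrent requests, which is what the feature scenario above drives.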