server: add test for token probs (#7347)

Johannes Gäßler 2024-05-19 16:26:02 +02:00 committed by GitHub
parent 41858392e1
commit 1b01f06db0
3 changed files with 81 additions and 10 deletions

@@ -48,7 +48,7 @@ The project is under active development, and we are [looking for feedback and co
 - `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
 - `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
 - `--embeddings`: Enable embedding vector output and the OAI compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled
-- `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`
+- `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`. Values > 1 will allow for higher throughput with multiple parallel requests but the results will **not** be deterministic due to differences in rounding error.
 - `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled
 - `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
 - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
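The nondeterminism warning for `--parallel` values greater than 1 is a floating-point effect: with multiple slots, tokens are grouped into batches differently, so the sums inside the matrix multiplications run in a different order. A minimal standalone illustration in plain Python (nothing here is llama.cpp code):

```python
# Floating-point addition is not associative: summing the same values in a
# different order can yield slightly different totals, which is enough to
# flip the sampled token when two candidates have nearly equal probability.
import random

random.seed(0)
values = [random.uniform(-1.0, 1.0) for _ in range(100_000)]

forward = sum(values)
backward = sum(reversed(values))

print(forward == backward)       # typically False
print(abs(forward - backward))   # tiny, but nonzero
```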

@@ -70,12 +70,48 @@ Feature: Results
     Then all predictions are equal
     Examples:
       | n_parallel | temp |
       | 1          | 0.0  |
       | 2          | 0.0  |
       | 4          | 0.0  |
       | 1          | 1.0  |
-      # FIXME: These tests fail on master. The problem seems to be the unified KV cache.
+      # FIXME: These tests fail on master.
+      # Problems: unified KV cache (except for CPU backend with LLAMA_NO_LLAMAFILE=1), SIMD nondeterminism.
       # See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
-      # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574 .
+      # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
+      # and https://github.com/ggerganov/llama.cpp/pull/7347 .
       # | 2          | 1.0  |
       # | 4          | 1.0  |
+
+  Scenario Outline: consistent token probs with same seed and prompt
+    Given <n_slots> slots
+    And <n_kv> KV cache size
+    And 1.0 temperature
+    And <n_predict> max tokens to predict
+    Then the server is starting
+    Then the server is healthy
+
+    Given 1 prompts "The meaning of life is" with seed 42
+    And concurrent completion requests
+    # Then the server is busy # Not all slots will be utilized.
+    Then the server is idle
+    And all slots are idle
+
+    Given <n_parallel> prompts "The meaning of life is" with seed 42
+    And concurrent completion requests
+    # Then the server is busy # Not all slots will be utilized.
+    Then the server is idle
+    And all slots are idle
+
+    Then all token probabilities are equal
+    Examples:
+      | n_slots | n_kv | n_predict | n_parallel |
+      | 4       | 1024 | 1         | 1          |
+      | 4       | 1024 | 1         | 4          |
+      # FIXME: These tests fail on master.
+      # Problems: unified KV cache (except for CPU backend with LLAMA_NO_LLAMAFILE=1), SIMD nondeterminism.
+      # See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
+      # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
+      # and https://github.com/ggerganov/llama.cpp/pull/7347 .
+      # | 4       | 1024 | 100       | 1          |
+      # This test still fails even with the above patches; the first token probabilities are already different.
+      # | 4       | 1024 | 100       | 4          |
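Outside the test harness, the property this scenario encodes can be checked by hand against a running server. Below is a rough sketch, not part of the commit: it assumes the `requests` package and a server listening on localhost:8080, and it uses the `/completion` fields that appear in this change (`seed`, `temperature`, `n_probs`).

```python
# Sketch: send the same prompt and seed twice; with a deterministic backend
# the reported token probabilities should be identical between runs.
import requests

def get_probs(prompt: str, seed: int) -> list:
    resp = requests.post("http://localhost:8080/completion", json={
        "prompt": prompt,
        "seed": seed,
        "temperature": 1.0,
        "n_predict": 1,
        "n_probs": 2,  # top-2 candidate probabilities per predicted token
    })
    resp.raise_for_status()
    return resp.json()["completion_probabilities"]

first = get_probs("The meaning of life is", 42)
second = get_probs("The meaning of life is", 42)
assert first == second, "token probabilities differ between identical requests"
```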

@@ -23,6 +23,7 @@ from prometheus_client import parser
 def step_server_config(context, server_fqdn, server_port):
     context.server_fqdn = server_fqdn
     context.server_port = int(server_port)
+    context.n_threads = None
     context.n_gpu_layer = None
     if 'PORT' in os.environ:
         context.server_port = int(os.environ['PORT'])
@@ -109,6 +110,11 @@ def step_n_gpu_layer(context, ngl):
     context.n_gpu_layer = ngl


+@step('{n_threads:d} threads')
+def step_n_threads(context, n_threads):
+    context.n_threads = n_threads
+
+
 @step('{draft:d} as draft')
 def step_draft(context, draft):
     context.draft = draft
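The `{n_threads:d}` placeholder in the new step uses behave's typed parameter syntax, so the step function receives an `int` rather than a string. A quick standalone illustration with the `parse` package, which behave's default step matcher builds on (installing `parse` separately is an assumption for running this outside behave):

```python
# The ':d' conversion in '{n_threads:d}' turns the matched digits into an int.
import parse  # pip install parse

result = parse.parse('{n_threads:d} threads', '8 threads')
print(result['n_threads'], type(result['n_threads']))  # 8 <class 'int'>
```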
@@ -274,13 +280,22 @@ async def step_predictions_equal(context):
 @step('all predictions are different')
 @async_run_until_complete
-async def step_predictions_equal(context):
+async def step_predictions_different(context):
     n_completions = await gather_tasks_results(context)
     assert n_completions >= 2, "need at least 2 completions"
     assert_all_predictions_different(context.tasks_result)
     context.tasks_result = []


+@step('all token probabilities are equal')
+@async_run_until_complete
+async def step_token_probabilities_equal(context):
+    n_completions = await gather_tasks_results(context)
+    assert n_completions >= 2, "need at least 2 completions"
+    assert_all_token_probabilities_equal(context.tasks_result)
+    context.tasks_result = []
+
+
 @step('the completion is truncated')
 def step_assert_completion_truncated(context):
     step_assert_completion_truncated(context, '')
@@ -869,6 +884,7 @@ async def request_completion(prompt,
                                     "id_slot": id_slot,
                                     "seed": seed if seed is not None else 42,
                                     "temperature": temperature if temperature is not None else "0.8f",
+                                    "n_probs": 2,
                                 },
                                 headers=headers,
                                 timeout=3600) as response:
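Setting `"n_probs": 2` asks the server to attach the top-2 token probabilities to each predicted position. As a sketch of the shape (values invented; the new assertion below only reads `completion_probabilities[pos]['probs']`, and the exact per-candidate keys shown here are an assumption about the server's response format):

```python
# Illustrative fragment of a /completion response with "n_probs": 2.
example_response = {
    "content": " to be happy",
    "completion_probabilities": [        # one entry per predicted token
        {
            "content": " to",
            "probs": [                   # top-2 candidates at this position
                {"tok_str": " to", "prob": 0.61},  # key names are assumed
                {"tok_str": " a", "prob": 0.12},
            ],
        },
        # ...
    ],
}
```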
@@ -1123,6 +1139,23 @@ def assert_all_predictions_different(completion_responses):
             assert content_i != content_j, "contents not different"


+def assert_all_token_probabilities_equal(completion_responses):
+    n_predict = len(completion_responses[0]['completion_probabilities'])
+    if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
+        for pos in range(n_predict):
+            for i, response_i in enumerate(completion_responses):
+                probs_i = response_i['completion_probabilities'][pos]['probs']
+                print(f"pos {pos}, probs {i}: {probs_i}")
+    for pos in range(n_predict):
+        for i, response_i in enumerate(completion_responses):
+            probs_i = response_i['completion_probabilities'][pos]['probs']
+            for j, response_j in enumerate(completion_responses):
+                if i == j:
+                    continue
+                probs_j = response_j['completion_probabilities'][pos]['probs']
+                assert probs_i == probs_j, "token probabilities not equal"
+
+
 async def gather_tasks_results(context):
     n_tasks = len(context.concurrent_tasks)
     if context.debug:
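Note that the new helper asserts bit-exact equality of the probability lists, which is precisely the determinism property under test. For a backend that is only approximately reproducible, a hypothetical tolerance-based variant (not part of this commit) might look like:

```python
import math

# Hypothetical: accept the same candidates in the same order, with
# probabilities equal up to a small relative tolerance.
def probs_approximately_equal(probs_i, probs_j, rel_tol=1e-6):
    if len(probs_i) != len(probs_j):
        return False
    return all(
        pi['tok_str'] == pj['tok_str']  # assumed per-candidate key names
        and math.isclose(pi['prob'], pj['prob'], rel_tol=rel_tol)
        for pi, pj in zip(probs_i, probs_j)
    )
```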
@@ -1261,6 +1294,8 @@ def start_server_background(context):
         server_args.extend(['--batch-size', context.n_batch])
     if context.n_ubatch:
         server_args.extend(['--ubatch-size', context.n_ubatch])
+    if context.n_threads:
+        server_args.extend(['--threads', context.n_threads])
     if context.n_gpu_layer:
         server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
     if context.draft is not None: