tool-call: greedy sampling in server tests + tweak prompt

ochafik 2024-10-31 04:38:22 +00:00
parent be9de3ed8a
commit 542853b34b
2 changed files with 24 additions and 15 deletions

View File

@@ -93,6 +93,7 @@ def step_server_config(context, server_fqdn: str, server_port: str):
context.warmup = True
context.use_jinja = False
context.chat_template_file = None
context.greedy_sampling = False
# infill
context.infill_input_extra = None
@@ -190,6 +191,11 @@ def step_no_warmup(context):
context.warmup = False
@step('greedy sampling')
def step_greedy_sampling(context):
context.greedy_sampling = True
@step('a chat template file {file}')
def step_chat_template_file(context, file):
context.chat_template_file = file
@@ -446,13 +452,13 @@ def step_python_tool(context):
"type": "function",
"function": {
"name": "ipython",
"description": "",
"description": "Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
"parameters": {
"type": "object",
"properties": {
"code": {
"type": "string",
"description": ""
"description": "The code to run in the ipython interpreter."
}
},
"required": ["code"]
@@ -1658,6 +1664,8 @@ def start_server_background(context):
server_args.extend(['--lora', context.lora_file])
if context.disable_ctx_shift:
server_args.extend(['--no-context-shift'])
if context.greedy_sampling:
server_args.extend(['--samplers', 'top-k', '--top-k', '1'])
if not context.warmup:
server_args.extend(['--no-warmup'])
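
When greedy sampling is requested, the server is launched with --samplers top-k --top-k 1: restricting the candidate set to the single most likely token makes decoding deterministic, which is what lets the feature file below pin exact tool-call arguments. A minimal sketch of why top-k with k = 1 behaves like greedy (argmax) decoding; the token names and logit values are purely illustrative:

# Illustrative only: top-k filtering with k=1 keeps just the argmax token,
# so the subsequent sampling step has exactly one possible outcome.
def top_k_filter(logits: dict[str, float], k: int) -> dict[str, float]:
    return dict(sorted(logits.items(), key=lambda kv: kv[1], reverse=True)[:k])

logits = {"print": 2.1, "say": 1.3, "hello": 0.4}  # made-up values
survivors = top_k_filter(logits, k=1)
assert list(survivors) == ["print"]  # only the highest-logit token remains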

View File

@@ -6,6 +6,7 @@ Feature: llama.cpp server
Given a server listening on localhost:8080
And BOS token is 1
And 42 as server seed
And greedy sampling
And 8192 KV cache size
And 32 as batch size
And 1 slots
@@ -20,7 +21,7 @@ Feature: llama.cpp server
And the server is healthy
And a model test
And <n_predict> max tokens to predict
And a user prompt write a hello world in python
And a user prompt say hello world with python
And a tool choice required
And tools <tools>
And parallel tool calls is <parallel_tool_calls>
@@ -38,11 +39,11 @@ Feature: llama.cpp server
| NousResearch-Hermes-3-Llama-3.1-8B-tool_use | 64 | test | {} | [{"type":"function", "function": {"name": "test", "description": "", "parameters": {"type": "object", "properties": {}}}}] | disabled |
| NousResearch-Hermes-3-Llama-3.1-8B-tool_use | 128 | ipython | {"code": "Yes,"} | [{"type":"function", "function": {"name": "ipython", "description": "", "parameters": {"type": "object", "properties": {"code": {"type": "string", "description": ""}}, "required": ["code"]}}}] | disabled |
| meta-llama-Meta-Llama-3.1-8B-Instruct | 64 | test | {} | [{"type":"function", "function": {"name": "test", "description": "", "parameters": {"type": "object", "properties": {}}}}] | disabled |
| meta-llama-Meta-Llama-3.1-8B-Instruct | 64 | ipython | {"code": "it and realed at the otter. Asked Dave Dasty, Daisy is a big, shiny blue. As"} | [{"type":"function", "function": {"name": "ipython", "description": "", "parameters": {"type": "object", "properties": {"code": {"type": "string", "description": ""}}, "required": ["code"]}}}] | disabled |
| meta-llama-Meta-Llama-3.1-8B-Instruct | 64 | ipython | {"code": "it and realed at the otter. Asked Dave Daisy, Daisy is a big, shiny blue. As"} | [{"type":"function", "function": {"name": "ipython", "description": "", "parameters": {"type": "object", "properties": {"code": {"type": "string", "description": ""}}, "required": ["code"]}}}] | disabled |
| meta-llama-Llama-3.2-3B-Instruct | 64 | test | {} | [{"type":"function", "function": {"name": "test", "description": "", "parameters": {"type": "object", "properties": {}}}}] | disabled |
| meta-llama-Llama-3.2-3B-Instruct | 64 | ipython | {"code": "Yes,"} | [{"type":"function", "function": {"name": "ipython", "description": "", "parameters": {"type": "object", "properties": {"code": {"type": "string", "description": ""}}, "required": ["code"]}}}] | disabled |
| mistralai-Mistral-Nemo-Instruct-2407 | 128 | test | {} | [{"type":"function", "function": {"name": "test", "description": "", "parameters": {"type": "object", "properties": {}}}}] | disabled |
| mistralai-Mistral-Nemo-Instruct-2407 | 128 | ipython | {"code": "It's a small cat."} | [{"type":"function", "function": {"name": "ipython", "description": "", "parameters": {"type": "object", "properties": {"code": {"type": "string", "description": ""}}, "required": ["code"]}}}] | disabled |
| mistralai-Mistral-Nemo-Instruct-2407 | 128 | ipython | {"code": "It's a spector."} | [{"type":"function", "function": {"name": "ipython", "description": "", "parameters": {"type": "object", "properties": {"code": {"type": "string", "description": ""}}, "required": ["code"]}}}] | disabled |
Scenario Outline: Template <template_name> + tinystories model yields no tool call
@@ -52,7 +53,7 @@ Feature: llama.cpp server
And the server is healthy
And a model test
And <n_predict> max tokens to predict
And a user prompt write a hello world in python
And a user prompt say hello world with python
And tools [{"type":"function", "function": {"name": "test", "description": "", "parameters": {"type": "object", "properties": {}}}}]
And an OAI compatible chat completions request with no api error
Then no tool is called
@@ -71,7 +72,7 @@ Feature: llama.cpp server
And the server is healthy
And a model test
And 16 max tokens to predict
And a user prompt write a hello world in python
And a user prompt say hello world with python
And tools []
And an OAI compatible chat completions request with no api error
Then no tool is called
@@ -86,7 +87,7 @@ Feature: llama.cpp server
And the server is healthy
And a model test
And 256 max tokens to predict
And a user prompt write a hello world in python
And a user prompt say hello world with python
And python tool
And parallel tool calls is disabled
And an OAI compatible chat completions request with no api error
@@ -94,16 +95,16 @@ Feature: llama.cpp server
Examples: Prompts
| tool_name | tool_arguments | hf_repo | hf_file | template_override |
| ipython | {"code": "print('Hello, World!')"} | bartowski/Qwen2.5-7B-Instruct-GGUF | Qwen2.5-7B-Instruct-Q4_K_M.gguf | |
| ipython | {"code": "print('Hello, World!')"} | bartowski/Phi-3.5-mini-instruct-GGUF | Phi-3.5-mini-instruct-Q4_K_M.gguf | |
| ipython | {"code": "print('Hello, World!')"} | NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF | Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf | NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use |
| ipython | {"code": "print('Hello World!')"} | NousResearch/Hermes-3-Llama-3.1-8B-GGUF | Hermes-3-Llama-3.1-8B.Q4_K_M.gguf | NousResearch-Hermes-3-Llama-3.1-8B-tool_use |
| ipython | {"code": "print('Hello, World!')"} | bartowski/Mistral-Nemo-Instruct-2407-GGUF | Mistral-Nemo-Instruct-2407-Q4_K_M.gguf | mistralai-Mistral-Nemo-Instruct-2407 |
| ipython | {"code": "print(\"Hello World\")"} | bartowski/Qwen2.5-7B-Instruct-GGUF | Qwen2.5-7B-Instruct-Q4_K_M.gguf | |
| ipython | {"code": "print('Hello, World!')"} | bartowski/Phi-3.5-mini-instruct-GGUF | Phi-3.5-mini-instruct-Q4_K_M.gguf | |
| ipython | {"code": "print('Hello, world!')"} | NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF | Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf | NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use |
| ipython | {"code": "print('hello world')"} | NousResearch/Hermes-3-Llama-3.1-8B-GGUF | Hermes-3-Llama-3.1-8B.Q4_K_M.gguf | NousResearch-Hermes-3-Llama-3.1-8B-tool_use |
| ipython | {"code": "print('Hello, World!'}"} | lmstudio-community/Llama-3.2-1B-Instruct-GGUF | Llama-3.2-1B-Instruct-Q4_K_M.gguf | meta-llama-Llama-3.2-3B-Instruct |
| ipython | {"code": "print("} | lmstudio-community/Llama-3.2-3B-Instruct-GGUF | Llama-3.2-3B-Instruct-Q4_K_M.gguf | meta-llama-Llama-3.2-3B-Instruct |
| ipython | {"code": "print("} | lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF | Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf | |
| ipython | {"code": "print('Hello, World!')"} | bartowski/functionary-small-v3.2-GGUF | functionary-small-v3.2-Q8_0.gguf | meetkai-functionary-medium-v3.2 |
# | ipython | {"code": "print('Hello, world!')"} | bartowski/gemma-2-2b-it-GGUF | gemma-2-2b-it-Q4_K_M.gguf | |
# | ipython | {"code": "print('Hello, World!')"} | bartowski/functionary-small-v3.2-GGUF | functionary-small-v3.2-Q8_0.gguf | meetkai-functionary-medium-v3.2 |
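
The Examples rows above pin the exact tool name and JSON arguments each model is expected to emit now that decoding is greedy. A rough sketch of how such an expectation could be checked; the assert_tool_call helper below is hypothetical, not the step implementation used by the suite:

import json

def assert_tool_call(actual_call: dict, expected_name: str, expected_arguments: str):
    # Hypothetical helper: compare the tool name, then compare arguments as
    # parsed JSON so trivial formatting differences do not cause false failures.
    assert actual_call["function"]["name"] == expected_name
    assert json.loads(actual_call["function"]["arguments"]) == json.loads(expected_arguments)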
@slow
@@ -114,7 +115,7 @@ Feature: llama.cpp server
And the server is healthy
And a model test
And 256 max tokens to predict
And a user prompt write a hello world in python
And a user prompt say hello world with python
And parallel tool calls is disabled
And an OAI compatible chat completions request with no api error
Then no tool is called
@@ -128,7 +129,7 @@ Feature: llama.cpp server
And the server is healthy
And a model test
And 256 max tokens to predict
And a user prompt write a hello world in python
And a user prompt say hello world with python
And a tool choice none
And python tool
And parallel tool calls is disabled