diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index c0a74153e..e922d8ec0 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -93,6 +93,7 @@ def step_server_config(context, server_fqdn: str, server_port: str): context.warmup = True context.use_jinja = False context.chat_template_file = None + context.greedy_sampling = False # infill context.infill_input_extra = None @@ -190,6 +191,11 @@ def step_no_warmup(context): context.warmup = False +@step('greedy sampling') +def step_greedy_sampling(context): + context.greedy_sampling = True + + @step('a chat template file {file}') def step_chat_template_file(context, file): context.chat_template_file = file @@ -446,13 +452,13 @@ def step_python_tool(context): "type": "function", "function": { "name": "ipython", - "description": "", + "description": "Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.", "parameters": { "type": "object", "properties": { "code": { "type": "string", - "description": "" + "description": "The code to run in the ipython interpreter." } }, "required": ["code"] @@ -1658,6 +1664,8 @@ def start_server_background(context): server_args.extend(['--lora', context.lora_file]) if context.disable_ctx_shift: server_args.extend(['--no-context-shift']) + if context.greedy_sampling: + server_args.extend(['--samplers', 'top-k', '--top-k', '1']) if not context.warmup: server_args.extend(['--no-warmup']) diff --git a/examples/server/tests/features/tool_call.feature b/examples/server/tests/features/tool_call.feature index 7f8c0449e..4d5b7afa2 100644 --- a/examples/server/tests/features/tool_call.feature +++ b/examples/server/tests/features/tool_call.feature @@ -6,6 +6,7 @@ Feature: llama.cpp server Given a server listening on localhost:8080 And BOS token is 1 And 42 as server seed + And greedy sampling And 8192 KV cache size And 32 as batch size And 1 slots @@ -20,7 +21,7 @@ Feature: llama.cpp server And the server is healthy And a model test And max tokens to predict - And a user prompt write a hello world in python + And a user prompt say hello world with python And a tool choice required And tools And parallel tool calls is @@ -38,11 +39,11 @@ Feature: llama.cpp server | NousResearch-Hermes-3-Llama-3.1-8B-tool_use | 64 | test | {} | [{"type":"function", "function": {"name": "test", "description": "", "parameters": {"type": "object", "properties": {}}}}] | disabled | | NousResearch-Hermes-3-Llama-3.1-8B-tool_use | 128 | ipython | {"code": "Yes,"} | [{"type":"function", "function": {"name": "ipython", "description": "", "parameters": {"type": "object", "properties": {"code": {"type": "string", "description": ""}}, "required": ["code"]}}}] | disabled | | meta-llama-Meta-Llama-3.1-8B-Instruct | 64 | test | {} | [{"type":"function", "function": {"name": "test", "description": "", "parameters": {"type": "object", "properties": {}}}}] | disabled | - | meta-llama-Meta-Llama-3.1-8B-Instruct | 64 | ipython | {"code": "it and realed at the otter. Asked Dave Dasty, Daisy is a big, shiny blue. As"} | [{"type":"function", "function": {"name": "ipython", "description": "", "parameters": {"type": "object", "properties": {"code": {"type": "string", "description": ""}}, "required": ["code"]}}}] | disabled | + | meta-llama-Meta-Llama-3.1-8B-Instruct | 64 | ipython | {"code": "it and realed at the otter. Asked Dave Daisy, Daisy is a big, shiny blue. As"} | [{"type":"function", "function": {"name": "ipython", "description": "", "parameters": {"type": "object", "properties": {"code": {"type": "string", "description": ""}}, "required": ["code"]}}}] | disabled | | meta-llama-Llama-3.2-3B-Instruct | 64 | test | {} | [{"type":"function", "function": {"name": "test", "description": "", "parameters": {"type": "object", "properties": {}}}}] | disabled | | meta-llama-Llama-3.2-3B-Instruct | 64 | ipython | {"code": "Yes,"} | [{"type":"function", "function": {"name": "ipython", "description": "", "parameters": {"type": "object", "properties": {"code": {"type": "string", "description": ""}}, "required": ["code"]}}}] | disabled | | mistralai-Mistral-Nemo-Instruct-2407 | 128 | test | {} | [{"type":"function", "function": {"name": "test", "description": "", "parameters": {"type": "object", "properties": {}}}}] | disabled | - | mistralai-Mistral-Nemo-Instruct-2407 | 128 | ipython | {"code": "It's a small cat."} | [{"type":"function", "function": {"name": "ipython", "description": "", "parameters": {"type": "object", "properties": {"code": {"type": "string", "description": ""}}, "required": ["code"]}}}] | disabled | + | mistralai-Mistral-Nemo-Instruct-2407 | 128 | ipython | {"code": "It's a spector."} | [{"type":"function", "function": {"name": "ipython", "description": "", "parameters": {"type": "object", "properties": {"code": {"type": "string", "description": ""}}, "required": ["code"]}}}] | disabled | Scenario Outline: Template + tinystories model yields no tool call @@ -52,7 +53,7 @@ Feature: llama.cpp server And the server is healthy And a model test And max tokens to predict - And a user prompt write a hello world in python + And a user prompt say hello world with python And tools [{"type":"function", "function": {"name": "test", "description": "", "parameters": {"type": "object", "properties": {}}}}] And an OAI compatible chat completions request with no api error Then no tool is called @@ -71,7 +72,7 @@ Feature: llama.cpp server And the server is healthy And a model test And 16 max tokens to predict - And a user prompt write a hello world in python + And a user prompt say hello world with python And tools [] And an OAI compatible chat completions request with no api error Then no tool is called @@ -86,7 +87,7 @@ Feature: llama.cpp server And the server is healthy And a model test And 256 max tokens to predict - And a user prompt write a hello world in python + And a user prompt say hello world with python And python tool And parallel tool calls is disabled And an OAI compatible chat completions request with no api error @@ -94,16 +95,16 @@ Feature: llama.cpp server Examples: Prompts | tool_name | tool_arguments | hf_repo | hf_file | template_override | - | ipython | {"code": "print('Hello, World!')"} | bartowski/Qwen2.5-7B-Instruct-GGUF | Qwen2.5-7B-Instruct-Q4_K_M.gguf | | - | ipython | {"code": "print('Hello, World!')"} | bartowski/Phi-3.5-mini-instruct-GGUF | Phi-3.5-mini-instruct-Q4_K_M.gguf | | - | ipython | {"code": "print('Hello, World!')"} | NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF | Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf | NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use | - | ipython | {"code": "print('Hello World!')"} | NousResearch/Hermes-3-Llama-3.1-8B-GGUF | Hermes-3-Llama-3.1-8B.Q4_K_M.gguf | NousResearch-Hermes-3-Llama-3.1-8B-tool_use | | ipython | {"code": "print('Hello, World!')"} | bartowski/Mistral-Nemo-Instruct-2407-GGUF | Mistral-Nemo-Instruct-2407-Q4_K_M.gguf | mistralai-Mistral-Nemo-Instruct-2407 | + | ipython | {"code": "print(\"Hello World\")"} | bartowski/Qwen2.5-7B-Instruct-GGUF | Qwen2.5-7B-Instruct-Q4_K_M.gguf | | + | ipython | {"code": "print('Hello, World!')"} | bartowski/Phi-3.5-mini-instruct-GGUF | Phi-3.5-mini-instruct-Q4_K_M.gguf | | + | ipython | {"code": "print('Hello, world!')"} | NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF | Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf | NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use | + | ipython | {"code": "print('hello world')"} | NousResearch/Hermes-3-Llama-3.1-8B-GGUF | Hermes-3-Llama-3.1-8B.Q4_K_M.gguf | NousResearch-Hermes-3-Llama-3.1-8B-tool_use | | ipython | {"code": "print('Hello, World!'}"} | lmstudio-community/Llama-3.2-1B-Instruct-GGUF | Llama-3.2-1B-Instruct-Q4_K_M.gguf | meta-llama-Llama-3.2-3B-Instruct | | ipython | {"code": "print("} | lmstudio-community/Llama-3.2-3B-Instruct-GGUF | Llama-3.2-3B-Instruct-Q4_K_M.gguf | meta-llama-Llama-3.2-3B-Instruct | | ipython | {"code": "print("} | lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF | Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf | | - | ipython | {"code": "print('Hello, World!')"} | bartowski/functionary-small-v3.2-GGUF | functionary-small-v3.2-Q8_0.gguf | meetkai-functionary-medium-v3.2 | # | ipython | {"code": "print('Hello, world!')"} | bartowski/gemma-2-2b-it-GGUF | gemma-2-2b-it-Q4_K_M.gguf | | + # | ipython | {"code": "print('Hello, World!')"} | bartowski/functionary-small-v3.2-GGUF | functionary-small-v3.2-Q8_0.gguf | meetkai-functionary-medium-v3.2 | @slow @@ -114,7 +115,7 @@ Feature: llama.cpp server And the server is healthy And a model test And 256 max tokens to predict - And a user prompt write a hello world in python + And a user prompt say hello world with python And parallel tool calls is disabled And an OAI compatible chat completions request with no api error Then no tool is called @@ -128,7 +129,7 @@ Feature: llama.cpp server And the server is healthy And a model test And 256 max tokens to predict - And a user prompt write a hello world in python + And a user prompt say hello world with python And a tool choice none And python tool And parallel tool calls is disabled