diff --git a/common/tool-call.cpp b/common/tool-call.cpp
index 68ed0f494..ef7a2fb6e 100644
--- a/common/tool-call.cpp
+++ b/common/tool-call.cpp
@@ -462,8 +462,8 @@ llama_tool_call_handler llama_tool_call_handler_init(
                 handler.grammar_trigger_words.push_back("[{\"");
                 handler.grammar_trigger_words.push_back("[ { \"");
             }
-            auto tweaked_messages = add_system(messages, "Prefix any tool calls with [TOOL_CALLS]");
-            handler.prompt = tmpl.apply(tweaked_messages, tools, /* add_generation_prompt= */ true);
+            // auto tweaked_messages = add_system(messages, "You are a helpful AI with tool calling capabilities. Prefix any tool calls with [TOOL_CALLS]");
+            handler.prompt = tmpl.apply(messages, tools, /* add_generation_prompt= */ true);
             break;
         }
         case llama_tool_call_style::Llama31:
diff --git a/examples/agent/run.py b/examples/agent/run.py
index f4859edda..3dea29818 100644
--- a/examples/agent/run.py
+++ b/examples/agent/run.py
@@ -80,7 +80,7 @@ async def main(
     tool_map, tools = await discover_tools(tools or [], verbose)
 
     sys.stdout.write(f'🛠️ Tools: {", ".join(tool_map.keys()) if tool_map else ""}\n')
-    
+
     try:
         messages = []
 
@@ -171,7 +171,7 @@ async def main(
                 role='user',
                 content=input('💬 ')
             ))
-            
+
     except aiohttp.ClientResponseError as e:
         sys.stdout.write(f'💥 {e}\n')
         sys.exit(1)
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index e21e20fa7..142356931 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -4,13 +4,14 @@
 import asyncio
 import json
 import os
+import parse
 import re
+import requests
 import socket
 import subprocess
 import sys
 import threading
 import time
-import requests
 from collections.abc import Sequence
 from contextlib import closing
 from re import RegexFlag
@@ -1617,7 +1618,10 @@ def start_server_background(context):
 
     def server_log(in_stream, out_stream):
         for line in iter(in_stream.readline, b''):
-            print(line.decode('utf-8'), end='', file=out_stream)
+            try:
+                print(line.decode('utf-8'), end='', file=out_stream)
+            except UnicodeDecodeError:
+                print(line, end='', file=out_stream)
 
     thread_stdout = threading.Thread(target=server_log, args=(context.server_process.stdout, sys.stdout))
     thread_stdout.start()
diff --git a/examples/server/tests/features/tool_call.feature b/examples/server/tests/features/tool_call.feature
index 530565cba..583e7211f 100644
--- a/examples/server/tests/features/tool_call.feature
+++ b/examples/server/tests/features/tool_call.feature
@@ -13,7 +13,7 @@ Feature: llama.cpp server
     And   jinja templates are enabled
 
 
-  Scenario Outline: OAI Compatibility w/ tools and required tool_choice (<template_name> template, <tool_name> tool)
+  Scenario Outline: Template <template_name> + tinystories model w/ required tool_choice yields <tool_name> tool call
     Given a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
     And   a test chat template file named <template_name>
     And   the server is starting
@@ -41,7 +41,7 @@ Feature: llama.cpp server
       | mistralai-Mistral-Nemo-Instruct-2407 | 128 | ipython | {"code": "It's a small cable."} | [{"type":"function", "function": {"name": "ipython", "description": "", "parameters": {"type": "object", "properties": {"code": {"type": "string", "description": ""}}, "required": ["code"]}}}] | disabled |
 
 
-  Scenario Outline: OAI Compatibility w/ tools and auto tool_choice (<template_name> template)
+  Scenario Outline: Template <template_name> + tinystories model yields no tool call
     Given a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
     And   a test chat template file named <template_name>
     And   the server is starting
@@ -60,22 +60,21 @@ Feature: llama.cpp server
       | meetkai-functionary-medium-v3.2 | 128 |
 
 
-  Scenario: OAI Compatibility w/ no tool
+  Scenario: Tool call template + tinystories and no tool won't call any tool
     Given a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    And   a chat template file ../../../tests/chat/templates/meta-llama-Meta-Llama-3.1-8B-Instruct.jinja
+    And   a test chat template file named meta-llama-Meta-Llama-3.1-8B-Instruct
     And   the server is starting
     And   the server is healthy
     And   a model test
     And   16 max tokens to predict
     And   a user prompt write a hello world in python
-    And   a tool choice <tool_choice>
     And   tools []
     And   an OAI compatible chat completions request with no api error
     Then  no tool is called
 
 
   @slow
-  Scenario Outline: OAI Compatibility w/ tools (<hf_repo> / <hf_file> with <template_override> template)
+  Scenario Outline: Python hello world w/ <hf_repo> + python tool yields tool call
    Given a model file <hf_file> from HF repo <hf_repo>
    And   a test chat template file named <template_override>
    And   no warmup
@@ -83,7 +82,7 @@ Feature: llama.cpp server
     And   the server is healthy
     And   a model test
     And   256 max tokens to predict
-    And   a user prompt write a hello world in python (use single quotes for strings)
+    And   a user prompt write a hello world in python
     And   python tool
     And   parallel tool calls is disabled
     And   an OAI compatible chat completions request with no api error
@@ -91,11 +90,27 @@ Feature: llama.cpp server
 
     Examples: Prompts
       | tool_name | tool_arguments | hf_repo | hf_file | template_override |
-      | ipython | {"code": "print('Hello, world!')"} | NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF | Hermes-2-Pro-Llama-3-8B-Q8_0.gguf | NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use |
+      | ipython | {"code": "print('Hello, World!')"} | bartowski/Phi-3.5-mini-instruct-GGUF | Phi-3.5-mini-instruct-Q4_K_M.gguf | |
+      | ipython | {"code": "print('Hello, World!')"} | NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF | Hermes-2-Pro-Llama-3-8B-Q8_0.gguf | NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use |
       | ipython | {"code": "print('Hello, World!')\n"} | bartowski/Mistral-Nemo-Instruct-2407-GGUF | Mistral-Nemo-Instruct-2407-Q8_0.gguf | mistralai-Mistral-Nemo-Instruct-2407 |
       | ipython | {"code": "print('Hello, World!'}"} | lmstudio-community/Llama-3.2-1B-Instruct-GGUF | Llama-3.2-1B-Instruct-Q4_K_M.gguf | meta-llama-Llama-3.2-3B-Instruct |
       | ipython | {"code": "print("} | lmstudio-community/Llama-3.2-3B-Instruct-GGUF | Llama-3.2-3B-Instruct-Q6_K.gguf | meta-llama-Llama-3.2-3B-Instruct |
       | ipython | {"code": "print("} | lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF | Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf | |
-      | ipython | {"code": "print("} | lmstudio-community/Meta-Llama-3.1-70B-Instruct-GGUF | Meta-Llama-3.1-70B-Instruct-Q4_K_M.gguf | |
+      # | ipython | {"code": "print("} | lmstudio-community/Meta-Llama-3.1-70B-Instruct-GGUF | Meta-Llama-3.1-70B-Instruct-Q4_K_M.gguf | |
+      # | ipython | {"code": "print('Hello, world!')"} | bartowski/gemma-2-2b-it-GGUF | gemma-2-2b-it-Q4_K_M.gguf | |
       # | ipython | {"code": "print('Hello, World!')"} | meetkai/functionary-small-v3.2-GGUF | functionary-small-v3.2.Q4_0.gguf | meetkai-functionary-medium-v3.2 |
+
+  @slow
+  Scenario Outline: Python hello world w/ <template_override> + no tool yields no tool call
+    Given a model file Phi-3.5-mini-instruct-Q4_K_M.gguf from HF repo bartowski/Phi-3.5-mini-instruct-GGUF
+    And   a test chat template file named <template_override>
+    And   no warmup
+    And   the server is starting
+    And   the server is healthy
+    And   a model test
+    And   256 max tokens to predict
+    And   a user prompt write a hello world in python
+    And   parallel tool calls is disabled
+    And   an OAI compatible chat completions request with no api error
+    Then  no tool is called
 