tool-call: more eager function call parsing for Functionary & Llama (give the 3B model a chance)

ochafik 2024-09-28 18:30:59 +01:00
parent 8b2cf3509f
commit 7cef90cf9c
3 changed files with 128 additions and 110 deletions


@@ -57,6 +57,56 @@ static bool parse_json(std::string::const_iterator & it, const std::string::const_iterator & end, json & out)
     }
 }
 
+/**
+ * Takes a prefix regex that must have 1 group to capture the function name, a closing suffix, and expects json parameters in between.
+ * Aggregates the prefix, suffix and in-between text into the content.
+ */
+static llama_tool_calls parse_json_tool_calls(const json & tools, const std::string& input, const std::regex & function_regex, const std::regex & close_regex, bool check_names) {
+    std::smatch match;
+
+    llama_tool_calls result;
+    auto end = input.end();
+    auto it = input.begin();
+
+    std::unordered_set<std::string> tool_names;
+    if (check_names) {
+        for (const auto & tool : tools) {
+            if (tool.contains("type") && tool["type"] == "function") {
+                tool_names.insert(tool["function"]["name"]);
+            }
+        }
+    }
+
+    while (it != end) {
+        std::sregex_iterator rend;
+        std::sregex_iterator rit(it, end, function_regex);
+        if (rit == rend) {
+            result.content += std::string(it, end);
+            break;
+        }
+
+        auto name = rit->str(1);
+        if (check_names && tool_names.find(name) == tool_names.end()) {
+            result.content += std::string(it, rit->suffix().first);
+            break;
+        }
+
+        result.content += std::string(it, rit->prefix().second);
+        it = rit->suffix().first;
+
+        json arguments;
+        if (!parse_json(it, end, arguments)) {
+            throw std::runtime_error("Failed to parse json tool call arguments");
+        }
+        if (!std::regex_search(it, end, match, close_regex)) {
+            throw std::runtime_error("Malformed input, missing closing pattern");
+        }
+        it = match.suffix().first;
+        result.tool_calls.push_back({name, arguments.dump()});
+    }
+    return result;
+}
+
 static llama_tool_calls parse_hermes_tool_calls(const std::string& input) {
     try {
         std::regex start_pattern(R"([\n\s]*<tool_call>)");
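To make the new helper's contract concrete, here is a standalone sketch (not part of the commit) of how a prefix/close regex pair carves up a model response, using the `<function=...>` format handled further down; the tool name and arguments are made up:

```cpp
// Minimal illustration of the parse_json_tool_calls contract; the tool
// name "get_weather" and its arguments are hypothetical.
#include <iostream>
#include <regex>
#include <string>

int main() {
    std::string input = "Let me check.<function=get_weather>{\"location\": \"Paris\"}</function>";
    std::regex function_regex(R"(<function=(\w+)>)");  // group 1 captures the name
    std::regex close_regex(R"(</function>)");

    std::smatch m;
    if (std::regex_search(input, m, function_regex)) {
        std::cout << "content: " << m.prefix() << "\n";  // Let me check.
        std::cout << "name:    " << m[1] << "\n";        // get_weather
        // parse_json_tool_calls would parse JSON starting at m.suffix(),
        // then require close_regex to match right after the arguments.
        std::string rest = m.suffix();
        std::smatch c;
        if (std::regex_search(rest, c, close_regex)) {
            std::cout << "args:    " << c.prefix() << "\n";  // {"location": "Paris"}
        }
    }
}
```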
@@ -100,81 +150,21 @@ static llama_tool_calls parse_hermes_tool_calls(const std::string& input) {
     }
 }
 
-static llama_tool_calls parse_llama_3_1_tool_calls(const json & tools, const std::string& input) {
-    static std::regex python_tag_regex(R"(<\|python_tag\|>([\s\S\n]*)$)");
-    std::smatch match;
-    if (std::regex_search(input, match, python_tag_regex)) {
-        return {
-            match.prefix().str(), {
-                {"ipython", (json {{"code", match[1].str()}}).dump()},
-            }
-        };
-    }
-    try {
-        auto call = json::parse(input);
-        // Only treat JSON as a tool call if it has a name attribute that matches any of the tools specified in the request.
-        // There doesn't seem to be any better way to detect a tool call.
-        if (call.contains("name") && call["name"].is_string()) {
-            std::string name = call["name"];
-            for (const auto & tool : tools) {
-                if (tool.at("function").at("name") == name) {
-                    return {
-                        "",
-                        {
-                            {name, call["parameters"].dump()},
-                        }
-                    };
-                }
-            }
-        }
-    } catch (const std::exception & e) {
-        // Do nothing
-    }
-    return {input, {}};
-}
-
-static llama_tool_calls parse_functionary_tool_calls(const json & tools, const std::string& input, const std::regex & function_regex, const std::regex & close_regex) {
-    std::smatch match;
-    llama_tool_calls result;
-    auto end = input.end();
-    auto it = input.begin();
-
-    std::unordered_set<std::string> tool_names;
-    for (const auto & tool : tools) {
-        if (tool.contains("type") && tool["type"] == "function") {
-            tool_names.insert(tool["function"]["name"]);
-        }
-    }
-
-    while (it != end) {
-        std::sregex_iterator rend;
-        std::sregex_iterator rit(it, end, function_regex);
-        if (rit == rend) {
-            result.content += std::string(it, end);
-            break;
-        }
-        auto name = rit->str(1);
-        if (tool_names.find(name) == tool_names.end()) {
-            result.content += std::string(it, rit->suffix().first);
-            break;
-        }
-        result.content += std::string(it, rit->prefix().second);
-        it = rit->suffix().first;
-
-        json arguments;
-        if (!parse_json(it, end, arguments)) {
-            throw std::runtime_error("Failed to parse json tool call arguments");
-        }
-        if (!std::regex_search(it, end, match, close_regex)) {
-            throw std::runtime_error("Malformed input, missing closing pattern");
-        }
-        it = match.suffix().first;
-        result.tool_calls.push_back({name, arguments.dump()});
-    }
-    return result;
-}
+static llama_tool_calls parse_llama_3_tool_calls(const json & tools, const std::string& input, bool allow_python_tag) {
+    if (allow_python_tag) {
+        static std::regex python_tag_regex(R"(<\|python_tag\|>([\s\S\n]*)$)");
+        std::smatch match;
+        if (std::regex_search(input, match, python_tag_regex)) {
+            return {
+                match.prefix().str(), {
+                    {"ipython", (json {{"code", match[1].str()}}).dump()},
+                }
+            };
+        }
+    }
+    static std::regex function_regex("(?:^|\\n)\\{\"name\": \"([^\"]+)\", \"parameters\": ");
+    static std::regex close_regex("\\}");
+    return parse_json_tool_calls(tools, input, function_regex, close_regex, /* check_names= */ false);
+}
 
 static llama_tool_calls parse_functionary_v3_llama_3_1_tool_calls(const json & tools, const std::string& input) {
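To see what the new `function_regex` in `parse_llama_3_tool_calls` accepts, here is a standalone sketch (not from the commit); the tool name `test` mirrors the feature-file scenarios at the end of this commit:

```cpp
#include <iostream>
#include <regex>
#include <string>

int main() {
    // A Llama-3.2-style tool call, with or without a leading newline.
    std::string input = "{\"name\": \"test\", \"parameters\": {}}";
    std::regex function_regex("(?:^|\\n)\\{\"name\": \"([^\"]+)\", \"parameters\": ");

    std::smatch m;
    if (std::regex_search(input, m, function_regex)) {
        std::cout << "tool name: " << m[1] << "\n";        // test
        // parse_json_tool_calls would now consume the JSON arguments ("{}")
        // starting at m.suffix(), then expect close_regex ("\\}") to match
        // the wrapper's final "}".
        std::cout << "rest:      " << m.suffix() << "\n";  // {}}
    }
}
```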
@@ -190,19 +180,21 @@ static llama_tool_calls parse_functionary_v3_llama_3_1_tool_calls(const json & tools, const std::string& input) {
     }
     static std::regex function_regex(R"(<function=(\w+)>)");
     static std::regex close_regex(R"(</function>)");
-    return parse_functionary_tool_calls(tools, input, function_regex, close_regex);
+    return parse_json_tool_calls(tools, input, function_regex, close_regex, /* check_names= */ false);
 }
 
 static llama_tool_calls parse_functionary_v3_tool_calls(const json & tools, const std::string& input) {
     static std::regex function_regex(R"((?:>>>)?(\w+)\n)");
-    static std::regex close_regex(R"($|\n(?=>>>))");
-    return parse_functionary_tool_calls(tools, input, function_regex, close_regex);
+    static std::regex close_regex(R"($|(?=>>>))");
+    return parse_json_tool_calls(tools, input, function_regex, close_regex, /* check_names= */ true);
 }
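The v3 `function_regex` is deliberately lax (any word followed by a newline, with an optional `>>>` prefix), which is why this call site passes `check_names = true`. A standalone sketch (not part of the commit, hypothetical names) of the candidates it produces:

```cpp
#include <iostream>
#include <regex>
#include <string>

int main() {
    // Prose followed by two Functionary-v3-style calls.
    std::string input = "Let me think\nget_weather\n{\"location\": \"Paris\"}\n>>>send_mail\n{\"to\": \"a@b.c\"}";
    std::regex function_regex(R"((?:>>>)?(\w+)\n)");

    for (std::sregex_iterator it(input.begin(), input.end(), function_regex), end; it != end; ++it) {
        std::cout << "candidate: " << (*it)[1] << "\n";
    }
    // Prints "think", "get_weather" and "send_mail": the first is a false
    // positive from ordinary prose, which the tool-name check filters out.
}
```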
 
 llama_tool_calls parse_tool_calls(llama_tool_call_style style, const json & tools, const std::string& input) {
     switch (style) {
         case llama_tool_call_style::Llama31:
-            return parse_llama_3_1_tool_calls(tools, input);
+            return parse_llama_3_tool_calls(tools, input, /* allow_python_tag= */ true);
+        case llama_tool_call_style::Llama32:
+            return parse_llama_3_tool_calls(tools, input, /* allow_python_tag= */ false);
         case llama_tool_call_style::FunctionaryV3Llama3:
             return parse_functionary_v3_tool_calls(tools, input);
         case llama_tool_call_style::FunctionaryV3Llama31:
@@ -224,9 +216,19 @@ llama_tool_call_handler llama_tool_call_handler_init(
     llama_tool_call_handler handler;
     switch (tmpl.tool_call_style()) {
-        case llama_tool_call_style::Llama31: {
+        case llama_tool_call_style::Llama31:
+        case llama_tool_call_style::Llama32: {
+            static auto builtin_tools = json {"wolfram_alpha", "brave_search"};
+            auto uses_python_tag = tmpl.tool_call_style() == llama_tool_call_style::Llama31;
+
+            // Technically we should only trigger on `"\n{\"name\": \"" + name + "\""` for each tool name,
+            // but Llama-3.2-3B struggles to output valid tool calls so we're "guiding" it strongly as soon
+            // as it seems to be outputting some JSON.
+            // TODO: make this conditional on a very small model (e.g. 1B / 3B).
+            auto eagerly_match_any_json = true;
             handler.grammar = build_grammar([&](const llama_grammar_builder & builder) {
-                static std::vector<std::string> builtin_tools {"wolfram_alpha", "brave_search"};
                 std::vector<std::string> tool_rules;
                 for (const auto & tool : tools) {
@@ -234,7 +236,7 @@ llama_tool_call_handler llama_tool_call_handler_init(
                     std::string name = function["name"];
                     auto parameters = function["parameters"];
                     builder.resolve_refs(parameters);
-                    if (name == "ipython" || std::find(builtin_tools.begin(), builtin_tools.end(), name) != builtin_tools.end()) {
+                    if (uses_python_tag && (name == "ipython" || builtin_tools.contains(name))) {
                         tool_rules.push_back(builder.add_rule("ipython-call", "\"<|python_tag|>\" .*"));
                         if (allow_content) {
                             handler.grammar_trigger_words.push_back("<|python_tag|>");
@@ -244,15 +246,20 @@ llama_tool_call_handler llama_tool_call_handler_init(
                         tool_rules.push_back(
                             builder.add_rule(
                                 name + "-call",
-                                "\"\\n{\\\"name\\\": \\\"" + name + "\\\", \\\"parameters\\\": \" " +
+                                "\"\\n\"? \"{\\\"name\\\": \\\"" + name + "\\\", \\\"parameters\\\": \" " +
                                 builder.add_schema(name + "-args", parameters) +
                                 " \"}\""));
-                        if (allow_content) {
+                        if (allow_content && !eagerly_match_any_json) {
                             handler.grammar_trigger_words.push_back("\n{\"name\": \"" + name + "\"");
                         }
                     }
                 }
+                if (allow_content && eagerly_match_any_json) {
+                    handler.grammar_trigger_words.push_back("\n{\"");
+                    handler.grammar_trigger_words.push_back("{\"");
+                }
                 builder.add_rule("root", join(tool_rules.begin(), tool_rules.end(), " | "));
             });
             handler.additional_stop_words.push_back("<|eom_id|>");
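The escaping in the `name + "-call"` rule above is hard to read; this standalone sketch (not part of the commit; hypothetical tool name, with `get_weather-args` standing in for the rule returned by `builder.add_schema`) prints the GBNF text it produces:

```cpp
#include <iostream>
#include <string>

int main() {
    std::string name = "get_weather";
    std::string args_rule = name + "-args";  // stands in for builder.add_schema(...)
    std::string rule =
        "\"\\n\"? \"{\\\"name\\\": \\\"" + name + "\\\", \\\"parameters\\\": \" " +
        args_rule + " \"}\"";
    std::cout << rule << "\n";
    // "\n"? "{\"name\": \"get_weather\", \"parameters\": " get_weather-args "}"
    // i.e. an optional newline, the fixed JSON prefix naming the tool, the
    // schema-constrained arguments, and the closing brace.
}
```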
@@ -274,7 +281,7 @@ llama_tool_call_handler llama_tool_call_handler_init(
                 auto parameters = function["parameters"];
                 auto args_rule = builder.add_schema(name + "-args", parameters);
                 first_tool_rules.push_back(builder.add_rule(name + "-call", "\"" + name + "\\n\" " + args_rule));
-                subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\"\\n>>>" + name + "\\n\" " + args_rule));
+                subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>" + name + "\\n\" " + args_rule));
                 if (allow_content) {
                     handler.grammar_trigger_words.push_back(name + "\n");
                     handler.grammar_trigger_words.push_back(">>>" + name + "\n");
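Same idea for the Functionary v3 rules above: only the first call starts bare, and subsequent calls carry the `>>>` separator, which the parser's regexes now treat as a prefix rather than a terminator. A sketch (not from the commit) with a hypothetical tool:

```cpp
#include <iostream>
#include <string>

int main() {
    std::string name = "get_weather";
    std::string args_rule = name + "-args";  // stands in for builder.add_schema(...)
    std::cout << "\"" + name + "\\n\" " + args_rule << "\n";     // "get_weather\n" get_weather-args
    std::cout << "\">>>" + name + "\\n\" " + args_rule << "\n";  // ">>>get_weather\n" get_weather-args
}
```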


@@ -2,42 +2,47 @@
 
 - Install prerequisite: [uv](https://docs.astral.sh/uv/) (used to simplify python deps)
 
-- Run `llama-server` w/ jinja templates:
+- Run `llama-server` w/ jinja templates. Note that most models need a template override (the HF to GGUF conversion only retains a single `chat_template`, but sometimes the models only support tool calls in an alternative chat template).
 
 ```bash
 make -j LLAMA_CURL=1 llama-server
-./llama-server \
-  --jinja -fa \
-  -mu https://huggingface.co/lmstudio-community/Meta-Llama-3.1-70B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-70B-Instruct-Q4_K_M.gguf
-```
-
-<details>
-<summary>Instructions for NousResearch/Hermes-2-Pro-Llama-3-8B (needs template override)</summary>
-
-The HF model had two variants for its chat template (`default` and `tool_use`), but the GGUF only retained the `default` one.
-
-```bash
-./llama-server \
-  --jinja -fa \
-  -mu https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/resolve/main/Hermes-2-Pro-Llama-3-8B-Q8_0.gguf \
-  --chat-template-file tests/chat/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja
-```
-
-</details>
-
-<details>
-<summary>Instructions for meekai/functionary-small-v3.2 (needs template override)</summary>
-
-The template in the GGUF doesn't support tool calls, but its bigger brother's template can be used:
-
-```bash
-./llama-server \
-  --jinja -fa \
-  -mu https://huggingface.co/meetkai/functionary-small-v3.2-GGUF/resolve/main/functionary-small-v3.2.Q4_0.gguf \
-  --chat-template-file tests/chat/templates/meetkai-functionary-medium-v3.2.jinja
-```
-
-</details>
+
+# Nous Hermes 2 Pro Llama 3 8B
+./llama-server --jinja -fa --verbose \
+  -hfr NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF -hff Hermes-2-Pro-Llama-3-8B-Q8_0.gguf \
+  --chat-template-file tests/chat/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja
+
+# Llama 3.1 8B
+./llama-server --jinja -fa --verbose \
+  -hfr lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF -hff Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf
+
+# functionary-small-v3
+./llama-server --jinja -fa --verbose \
+  -hfr meetkai/functionary-small-v3.2-GGUF -hff functionary-small-v3.2.Q4_0.gguf \
+  --chat-template-file tests/chat/templates/meetkai-functionary-medium-v3.2.jinja
+./llama-server --jinja -fa --verbose \
+  -m ~/Downloads/functionary-small-v3.2.Q4_0.gguf \
+  --chat-template-file tests/chat/templates/meetkai-functionary-medium-v3.2.jinja
+
+# Llama 3.2 3B (poor adherence)
+./llama-server --jinja -fa --verbose \
+  -hfr lmstudio-community/Llama-3.2-3B-Instruct-GGUF -hff Llama-3.2-3B-Instruct-Q6_K_L.gguf \
+  --chat-template-file tests/chat/templates/meta-llama-Llama-3.2-3B-Instruct.jinja
+./llama-server --jinja -fa --verbose \
+  -m ~/Downloads/Llama-3.2-3B-Instruct-Q6_K_L.gguf \
+  --chat-template-file tests/chat/templates/meta-llama-Llama-3.2-3B-Instruct.jinja
+
+# Llama 3.2 1B (very poor adherence)
+./llama-server --jinja -fa --verbose \
+  -hfr lmstudio-community/Llama-3.2-1B-Instruct-GGUF -hff Llama-3.2-1B-Instruct-Q4_K_M.gguf \
+  --chat-template-file tests/chat/templates/meta-llama-Llama-3.2-3B-Instruct.jinja
+
+# Llama 3.1 70B (untested)
+./llama-server --jinja -fa --verbose \
+  -hfr lmstudio-community/Meta-Llama-3.1-70B-Instruct-GGUF -hff Meta-Llama-3.1-70B-Instruct-Q4_K_M.gguf
+```
 
 - Run some tools inside a docker container (check http://localhost:8088/docs once running):
 
@@ -57,3 +62,7 @@
   --tool-endpoint http://localhost:8088 \
   --goal "What is the sum of 2535 squared and 32222000403?"
 ```
+
+## TODO
+
+- Implement code_interpreter using whichever tools are builtin for a given model.


@@ -35,7 +35,9 @@ Feature: llama.cpp server
       | meetkai-functionary-medium-v3.2 | 128 | test | {} | [{"type":"function", "function": {"name": "test", "description": "", "parameters": {"type": "object", "properties": {}}}}] |
       | meetkai-functionary-medium-v3.2 | 128 | ipython | {"code": "Yes,"} | [{"type":"function", "function": {"name": "ipython", "description": "", "parameters": {"type": "object", "properties": {"code": {"type": "string", "description": ""}}, "required": ["code"]}}}] |
       | meta-llama-Meta-Llama-3.1-8B-Instruct | 64 | test | {} | [{"type":"function", "function": {"name": "test", "description": "", "parameters": {"type": "object", "properties": {}}}}] |
-      | meta-llama-Meta-Llama-3.1-8B-Instruct | 16 | ipython | {"code": "it and "} | [{"type":"function", "function": {"name": "ipython", "description": "", "parameters": {"type": "object", "properties": {"code": {"type": "string", "description": ""}}, "required": ["code"]}}}] |
+      | meta-llama-Meta-Llama-3.1-8B-Instruct | 64 | ipython | {"code": "it and realed at the otter. Asked Dave Dasty, Daisy is a big, shiny blue. As"} | [{"type":"function", "function": {"name": "ipython", "description": "", "parameters": {"type": "object", "properties": {"code": {"type": "string", "description": ""}}, "required": ["code"]}}}] |
+      | meta-llama-Llama-3.2-3B-Instruct | 64 | test | {} | [{"type":"function", "function": {"name": "test", "description": "", "parameters": {"type": "object", "properties": {}}}}] |
+      | meta-llama-Llama-3.2-3B-Instruct | 64 | ipython | {"code": "Yes,"} | [{"type":"function", "function": {"name": "ipython", "description": "", "parameters": {"type": "object", "properties": {"code": {"type": "string", "description": ""}}, "required": ["code"]}}}] |
 
   Scenario Outline: OAI Compatibility w/ tools and auto tool_choice