From 29fd7b56d02e4de5ca2d736fbbe87f7fbdc0f1b0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 22 Dec 2024 19:34:32 +0200 Subject: [PATCH] llama : chat ggml-ci --- src/CMakeLists.txt | 1 + src/llama-arch.cpp | 47 ---- src/llama-arch.h | 39 ---- src/llama-chat.cpp | 524 +++++++++++++++++++++++++++++++++++++++++++++ src/llama-chat.h | 47 ++++ src/llama.cpp | 501 +------------------------------------------ 6 files changed, 576 insertions(+), 583 deletions(-) create mode 100644 src/llama-chat.cpp create mode 100644 src/llama-chat.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 666632c25..23bf194e0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -11,6 +11,7 @@ add_library(llama llama.cpp llama-arch.cpp llama-batch.cpp + llama-chat.cpp llama-context.cpp llama-adapter.cpp llama-grammar.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 7465f4bc7..a447b3433 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -1385,50 +1385,3 @@ llm_arch llm_arch_from_string(const std::string & name) { const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor) { return LLM_TENSOR_INFOS.at(tensor); } - -// chat templates - -static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = { - { "chatml", LLM_CHAT_TEMPLATE_CHATML }, - { "llama2", LLM_CHAT_TEMPLATE_LLAMA_2 }, - { "llama2-sys", LLM_CHAT_TEMPLATE_LLAMA_2_SYS }, - { "llama2-sys-bos", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS }, - { "llama2-sys-strip", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP }, - { "mistral-v1", LLM_CHAT_TEMPLATE_MISTRAL_V1 }, - { "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 }, - { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN }, - { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 }, - { "phi3", LLM_CHAT_TEMPLATE_PHI_3 }, - { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR }, - { "monarch", LLM_CHAT_TEMPLATE_MONARCH }, - { "gemma", LLM_CHAT_TEMPLATE_GEMMA }, - { "orion", LLM_CHAT_TEMPLATE_ORION }, - { "openchat", LLM_CHAT_TEMPLATE_OPENCHAT }, - { "vicuna", LLM_CHAT_TEMPLATE_VICUNA }, - { "vicuna-orca", LLM_CHAT_TEMPLATE_VICUNA_ORCA }, - { "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK }, - { "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 }, - { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R }, - { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 }, - { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 }, - { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 }, - { "minicpm", LLM_CHAT_TEMPLATE_MINICPM }, - { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 }, - { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD }, - { "granite", LLM_CHAT_TEMPLATE_GRANITE }, - { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT }, -}; - -llm_chat_template llm_chat_template_from_str(const std::string & name) { - return LLM_CHAT_TEMPLATES.at(name); -} - -int32_t llama_chat_builtin_templates(const char ** output, size_t len) { - auto it = LLM_CHAT_TEMPLATES.begin(); - for (size_t i = 0; i < std::min(len, LLM_CHAT_TEMPLATES.size()); i++) { - output[i] = it->first.c_str(); - std::advance(it, 1); - } - return (int32_t) LLM_CHAT_TEMPLATES.size(); -} - diff --git a/src/llama-arch.h b/src/llama-arch.h index 976ef8f0c..7bc4e4ffd 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -388,42 +388,3 @@ const char * llm_arch_name(llm_arch arch); llm_arch llm_arch_from_string(const std::string & name); const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor); - -// -// Chat templates -// TODO: maybe move these to a separate module -// - -enum llm_chat_template { - LLM_CHAT_TEMPLATE_CHATML, - LLM_CHAT_TEMPLATE_LLAMA_2, - LLM_CHAT_TEMPLATE_LLAMA_2_SYS, - LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS, - LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP, - 
LLM_CHAT_TEMPLATE_MISTRAL_V1, - LLM_CHAT_TEMPLATE_MISTRAL_V3, - LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN, - LLM_CHAT_TEMPLATE_MISTRAL_V7, - LLM_CHAT_TEMPLATE_PHI_3, - LLM_CHAT_TEMPLATE_ZEPHYR, - LLM_CHAT_TEMPLATE_MONARCH, - LLM_CHAT_TEMPLATE_GEMMA, - LLM_CHAT_TEMPLATE_ORION, - LLM_CHAT_TEMPLATE_OPENCHAT, - LLM_CHAT_TEMPLATE_VICUNA, - LLM_CHAT_TEMPLATE_VICUNA_ORCA, - LLM_CHAT_TEMPLATE_DEEPSEEK, - LLM_CHAT_TEMPLATE_DEEPSEEK_2, - LLM_CHAT_TEMPLATE_COMMAND_R, - LLM_CHAT_TEMPLATE_LLAMA_3, - LLM_CHAT_TEMPLATE_CHATGML_3, - LLM_CHAT_TEMPLATE_CHATGML_4, - LLM_CHAT_TEMPLATE_MINICPM, - LLM_CHAT_TEMPLATE_EXAONE_3, - LLM_CHAT_TEMPLATE_RWKV_WORLD, - LLM_CHAT_TEMPLATE_GRANITE, - LLM_CHAT_TEMPLATE_GIGACHAT, - LLM_CHAT_TEMPLATE_UNKNOWN, -}; - -llm_chat_template llm_chat_template_from_str(const std::string & name); diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp new file mode 100644 index 000000000..1e00b9901 --- /dev/null +++ b/src/llama-chat.cpp @@ -0,0 +1,524 @@ +#include "llama-chat.h" + +#include "llama.h" + +#include <map> +#include <sstream> + +#if __cplusplus >= 202000L + #define LU8(x) (const char*)(u8##x) +#else + #define LU8(x) u8##x +#endif + +// trim whitespace from the beginning and end of a string +static std::string trim(const std::string & str) { + size_t start = 0; + size_t end = str.size(); + while (start < end && isspace(str[start])) { + start += 1; + } + while (end > start && isspace(str[end - 1])) { + end -= 1; + } + return str.substr(start, end - start); +} + +static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = { + { "chatml", LLM_CHAT_TEMPLATE_CHATML }, + { "llama2", LLM_CHAT_TEMPLATE_LLAMA_2 }, + { "llama2-sys", LLM_CHAT_TEMPLATE_LLAMA_2_SYS }, + { "llama2-sys-bos", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS }, + { "llama2-sys-strip", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP }, + { "mistral-v1", LLM_CHAT_TEMPLATE_MISTRAL_V1 }, + { "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 }, + { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN }, + { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 }, + { "phi3", LLM_CHAT_TEMPLATE_PHI_3 }, + { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR }, + { "monarch", LLM_CHAT_TEMPLATE_MONARCH }, + { "gemma", LLM_CHAT_TEMPLATE_GEMMA }, + { "orion", LLM_CHAT_TEMPLATE_ORION }, + { "openchat", LLM_CHAT_TEMPLATE_OPENCHAT }, + { "vicuna", LLM_CHAT_TEMPLATE_VICUNA }, + { "vicuna-orca", LLM_CHAT_TEMPLATE_VICUNA_ORCA }, + { "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK }, + { "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 }, + { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R }, + { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 }, + { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 }, + { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 }, + { "minicpm", LLM_CHAT_TEMPLATE_MINICPM }, + { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 }, + { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD }, + { "granite", LLM_CHAT_TEMPLATE_GRANITE }, + { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT }, +}; + +llm_chat_template llm_chat_template_from_str(const std::string & name) { + return LLM_CHAT_TEMPLATES.at(name); +} + +llm_chat_template llm_chat_detect_template(const std::string & tmpl) { + try { + return llm_chat_template_from_str(tmpl); + } catch (const std::out_of_range &) { + // ignore + } + + auto tmpl_contains = [&tmpl](const char * haystack) -> bool { + return tmpl.find(haystack) != std::string::npos; + }; + if (tmpl_contains("<|im_start|>")) { + return LLM_CHAT_TEMPLATE_CHATML; + } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) { + if (tmpl_contains("[SYSTEM_PROMPT]")) { + return LLM_CHAT_TEMPLATE_MISTRAL_V7; + } else if ( + // catches official 'v1' template + 
tmpl_contains("' [INST] ' + system_message") // catches official 'v3' and 'v3-tekken' templates + || tmpl_contains("[AVAILABLE_TOOLS]") + ) { + // Official mistral 'v1', 'v3' and 'v3-tekken' templates + // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md + // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md + if (tmpl_contains(" [INST]")) { + return LLM_CHAT_TEMPLATE_MISTRAL_V1; + } else if (tmpl_contains("\"[INST]\"")) { + return LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN; + } + return LLM_CHAT_TEMPLATE_MISTRAL_V3; + } else { + // llama2 template and its variants + // [variant] support system message + // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2 + bool support_system_message = tmpl_contains("<<SYS>>"); + bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]"); + bool strip_message = tmpl_contains("content.strip()"); + if (strip_message) { + return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP; + } else if (add_bos_inside_history) { + return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS; + } else if (support_system_message) { + return LLM_CHAT_TEMPLATE_LLAMA_2_SYS; + } else { + return LLM_CHAT_TEMPLATE_LLAMA_2; + } + } + } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) { + return LLM_CHAT_TEMPLATE_PHI_3; + } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) { + return LLM_CHAT_TEMPLATE_ZEPHYR; + } else if (tmpl_contains("bos_token + message['role']")) { + return LLM_CHAT_TEMPLATE_MONARCH; + } else if (tmpl_contains("<start_of_turn>")) { + return LLM_CHAT_TEMPLATE_GEMMA; + } else if (tmpl_contains("'\\n\\nAssistant: ' + eos_token")) { + // OrionStarAI/Orion-14B-Chat + return LLM_CHAT_TEMPLATE_ORION; + } else if (tmpl_contains("GPT4 Correct ")) { + // openchat/openchat-3.5-0106 + return LLM_CHAT_TEMPLATE_OPENCHAT; + } else if (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: ")) { + // eachadea/vicuna-13b-1.1 (and Orca variant) + if (tmpl_contains("SYSTEM: ")) { + return LLM_CHAT_TEMPLATE_VICUNA_ORCA; + } + return LLM_CHAT_TEMPLATE_VICUNA; + } else if (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>")) { + // deepseek-ai/deepseek-coder-33b-instruct + return LLM_CHAT_TEMPLATE_DEEPSEEK; + } else if (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>")) { + // CohereForAI/c4ai-command-r-plus + return LLM_CHAT_TEMPLATE_COMMAND_R; + } else if (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>")) { + return LLM_CHAT_TEMPLATE_LLAMA_3; + } else if (tmpl_contains("[gMASK]sop")) { + // chatglm3-6b + return LLM_CHAT_TEMPLATE_CHATGML_3; + } else if (tmpl_contains("[gMASK]<sop>")) { + return LLM_CHAT_TEMPLATE_CHATGML_4; + } else if (tmpl_contains(LU8("<用户>"))) { + // MiniCPM-3B-OpenHermes-2.5-v2-GGUF + return LLM_CHAT_TEMPLATE_MINICPM; + } else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) { + return LLM_CHAT_TEMPLATE_DEEPSEEK_2; + } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) { + // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb + // EXAONE-3.0-7.8B-Instruct + return LLM_CHAT_TEMPLATE_EXAONE_3; + } else if (tmpl_contains("rwkv-world")) { + return LLM_CHAT_TEMPLATE_RWKV_WORLD; + } else if (tmpl_contains("<|start_of_role|>")) { + return LLM_CHAT_TEMPLATE_GRANITE; + } else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + 
additional_special_tokens[1]")) { + return LLM_CHAT_TEMPLATE_GIGACHAT; + } + return LLM_CHAT_TEMPLATE_UNKNOWN; +} + +// Simple version of "llama_apply_chat_template" that only works with strings +// This function uses heuristic checks to determine commonly used template. It is not a jinja parser. +int32_t llm_chat_apply_template( + llm_chat_template tmpl, + const std::vector<const llama_chat_message *> & chat, + std::string & dest, bool add_ass) { + // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527 + std::stringstream ss; + if (tmpl == LLM_CHAT_TEMPLATE_CHATML) { + // chatml template + for (auto message : chat) { + ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n"; + } + if (add_ass) { + ss << "<|im_start|>assistant\n"; + } + } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) { + // Official mistral 'v7' template + // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7 + for (auto message : chat) { + std::string role(message->role); + std::string content(message->content); + if (role == "system") { + ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]"; + } else if (role == "user") { + ss << "[INST] " << content << "[/INST]"; + } + else { + ss << " " << content << "</s>"; + } + } + } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 + || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3 + || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN) { + // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md + // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md + std::string leading_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 ? " " : ""; + std::string trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN ? "" : " "; + bool trim_assistant_message = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3; + bool is_inside_turn = false; + for (auto message : chat) { + if (!is_inside_turn) { + ss << leading_space << "[INST]" << trailing_space; + is_inside_turn = true; + } + std::string role(message->role); + std::string content(message->content); + if (role == "system") { + ss << content << "\n\n"; + } else if (role == "user") { + ss << content << leading_space << "[/INST]"; + } else { + ss << trailing_space << (trim_assistant_message ? trim(content) : content) << "</s>"; + is_inside_turn = false; + } + } + } else if ( + tmpl == LLM_CHAT_TEMPLATE_LLAMA_2 + || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS + || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS + || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP) { + // llama2 template and its variants + // [variant] support system message + // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2 + bool support_system_message = tmpl != LLM_CHAT_TEMPLATE_LLAMA_2; + // [variant] add BOS inside history + bool add_bos_inside_history = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS; + // [variant] trim spaces from the input message + bool strip_message = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP; + // construct the prompt + bool is_inside_turn = true; // skip BOS at the beginning + ss << "[INST] "; + for (auto message : chat) { + std::string content = strip_message ? trim(message->content) : message->content; + std::string role(message->role); + if (!is_inside_turn) { + is_inside_turn = true; + ss << (add_bos_inside_history ? 
"<s>[INST] " : "[INST] "); + } + if (role == "system") { + if (support_system_message) { + ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n"; + } else { + // if the model does not support system message, we still include it in the first message, but without <<SYS>> + ss << content << "\n"; + } + } else if (role == "user") { + ss << content << " [/INST]"; + } else { + ss << content << "</s>"; + is_inside_turn = false; + } + } + } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_3) { + // Phi 3 + for (auto message : chat) { + std::string role(message->role); + ss << "<|" << role << "|>\n" << message->content << "<|end|>\n"; + } + if (add_ass) { + ss << "<|assistant|>\n"; + } + } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) { + // zephyr template + for (auto message : chat) { + ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n"; + } + if (add_ass) { + ss << "<|assistant|>\n"; + } + } else if (tmpl == LLM_CHAT_TEMPLATE_MONARCH) { + // mlabonne/AlphaMonarch-7B template (the <s> is included inside history) + for (auto message : chat) { + std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message + ss << bos << message->role << "\n" << message->content << "</s>\n"; + } + if (add_ass) { + ss << "<s>assistant\n"; + } + } else if (tmpl == LLM_CHAT_TEMPLATE_GEMMA) { + // google/gemma-7b-it + std::string system_prompt = ""; + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + // there is no system message for gemma, but we will merge it with user prompt, so nothing is broken + system_prompt = trim(message->content); + continue; + } + // in gemma, "assistant" is "model" + role = role == "assistant" ? "model" : message->role; + ss << "<start_of_turn>" << role << "\n"; + if (!system_prompt.empty() && role != "model") { + ss << system_prompt << "\n\n"; + system_prompt = ""; + } + ss << trim(message->content) << "<end_of_turn>\n"; + } + if (add_ass) { + ss << "<start_of_turn>model\n"; + } + } else if (tmpl == LLM_CHAT_TEMPLATE_ORION) { + // OrionStarAI/Orion-14B-Chat + std::string system_prompt = ""; + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + // there is no system message support, we will merge it with user prompt + system_prompt = message->content; + continue; + } else if (role == "user") { + ss << "Human: "; + if (!system_prompt.empty()) { + ss << system_prompt << "\n\n"; + system_prompt = ""; + } + ss << message->content << "\n\nAssistant: </s>"; + } else { + ss << message->content << "</s>"; + } + } + } else if (tmpl == LLM_CHAT_TEMPLATE_OPENCHAT) { + // openchat/openchat-3.5-0106, + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + ss << message->content << "<|end_of_turn|>"; + } else { + role[0] = toupper(role[0]); + ss << "GPT4 Correct " << role << ": " << message->content << "<|end_of_turn|>"; + } + } + if (add_ass) { + ss << "GPT4 Correct Assistant:"; + } + } else if (tmpl == LLM_CHAT_TEMPLATE_VICUNA || tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) { + // eachadea/vicuna-13b-1.1 (and Orca variant) + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + // Orca-Vicuna variant uses a system prefix + if (tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) { + ss << "SYSTEM: " << message->content << "\n"; + } else { + ss << message->content << "\n\n"; + } + } else if (role == "user") { + ss << "USER: " << message->content << "\n"; + } else if (role == "assistant") { + ss << "ASSISTANT: " << message->content << "</s>\n"; + } + } + if (add_ass) { + ss << "ASSISTANT:"; + } + } else if (tmpl == 
LLM_CHAT_TEMPLATE_DEEPSEEK) { + // deepseek-ai/deepseek-coder-33b-instruct + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + ss << message->content; + } else if (role == "user") { + ss << "### Instruction:\n" << message->content << "\n"; + } else if (role == "assistant") { + ss << "### Response:\n" << message->content << "\n<|EOT|>\n"; + } + } + if (add_ass) { + ss << "### Response:\n"; + } + } else if (tmpl == LLM_CHAT_TEMPLATE_COMMAND_R) { + // CohereForAI/c4ai-command-r-plus + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>"; + } else if (role == "user") { + ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>"; + } else if (role == "assistant") { + ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>"; + } + } + if (add_ass) { + ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"; + } + } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA_3) { + // Llama 3 + for (auto message : chat) { + std::string role(message->role); + ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n" << trim(message->content) << "<|eot_id|>"; + } + if (add_ass) { + ss << "<|start_header_id|>assistant<|end_header_id|>\n\n"; + } + } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) { + // chatglm3-6b + ss << "[gMASK]" << "sop"; + for (auto message : chat) { + std::string role(message->role); + ss << "<|" << role << "|>" << "\n " << message->content; + } + if (add_ass) { + ss << "<|assistant|>"; + } + } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) { + ss << "[gMASK]" << "<sop>"; + for (auto message : chat) { + std::string role(message->role); + ss << "<|" << role << "|>" << "\n" << message->content; + } + if (add_ass) { + ss << "<|assistant|>"; + } + } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) { + // MiniCPM-3B-OpenHermes-2.5-v2-GGUF + for (auto message : chat) { + std::string role(message->role); + if (role == "user") { + ss << LU8("<用户>"); + ss << trim(message->content); + ss << "<AI>"; + } else { + ss << trim(message->content); + } + } + } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_2) { + // DeepSeek-V2 + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + ss << message->content << "\n\n"; + } else if (role == "user") { + ss << "User: " << message->content << "\n\n"; + } else if (role == "assistant") { + ss << "Assistant: " << message->content << LU8("<|end▁of▁sentence|>"); + } + } + if (add_ass) { + ss << "Assistant:"; + } + } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) { + // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb + // EXAONE-3.0-7.8B-Instruct + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + ss << "[|system|]" << trim(message->content) << "[|endofturn|]\n"; + } else if (role == "user") { + ss << "[|user|]" << trim(message->content) << "\n"; + } else if (role == "assistant") { + ss << "[|assistant|]" << trim(message->content) << "[|endofturn|]\n"; + } + } + if (add_ass) { + ss << "[|assistant|]"; + } + } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) { + // this template requires the model to have "\n\n" as EOT token + for (auto message : chat) { + std::string role(message->role); + if (role == "user") { + ss << "User: " << message->content << "\n\nAssistant:"; + } else { + ss << message->content << 
"\n\n"; + } + } + } else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) { + // IBM Granite template + for (const auto & message : chat) { + std::string role(message->role); + ss << "<|start_of_role|>" << role << "<|end_of_role|>"; + if (role == "assistant_tool_call") { + ss << "<|tool_call|>"; + } + ss << message->content << "<|end_of_text|>\n"; + } + if (add_ass) { + ss << "<|start_of_role|>assistant<|end_of_role|>\n"; + } + } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) { + // GigaChat template + bool has_system = !chat.empty() && std::string(chat[0]->role) == "system"; + + // Handle system message if present + if (has_system) { + ss << "<s>" << chat[0]->content << "<|message_sep|>"; + } else { + ss << "<s>"; + } + + // Process remaining messages + for (size_t i = has_system ? 1 : 0; i < chat.size(); i++) { + std::string role(chat[i]->role); + if (role == "user") { + ss << "user<|role_sep|>" << chat[i]->content << "<|message_sep|>" + << "available functions<|role_sep|>[]<|message_sep|>"; + } else if (role == "assistant") { + ss << "assistant<|role_sep|>" << chat[i]->content << "<|message_sep|>"; + } + } + + // Add generation prompt if needed + if (add_ass) { + ss << "assistant<|role_sep|>"; + } + } else { + // template not supported + return -1; + } + dest = ss.str(); + return dest.size(); +} + +// public interface + +int32_t llama_chat_builtin_templates(const char ** output, size_t len) { + auto it = LLM_CHAT_TEMPLATES.begin(); + for (size_t i = 0; i < std::min(len, LLM_CHAT_TEMPLATES.size()); i++) { + output[i] = it->first.c_str(); + std::advance(it, 1); + } + return (int32_t) LLM_CHAT_TEMPLATES.size(); +} + diff --git a/src/llama-chat.h b/src/llama-chat.h new file mode 100644 index 000000000..96c96b698 --- /dev/null +++ b/src/llama-chat.h @@ -0,0 +1,47 @@ +#pragma once + +#include <string> +#include <vector> + +enum llm_chat_template { + LLM_CHAT_TEMPLATE_CHATML, + LLM_CHAT_TEMPLATE_LLAMA_2, + LLM_CHAT_TEMPLATE_LLAMA_2_SYS, + LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS, + LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP, + LLM_CHAT_TEMPLATE_MISTRAL_V1, + LLM_CHAT_TEMPLATE_MISTRAL_V3, + LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN, + LLM_CHAT_TEMPLATE_MISTRAL_V7, + LLM_CHAT_TEMPLATE_PHI_3, + LLM_CHAT_TEMPLATE_ZEPHYR, + LLM_CHAT_TEMPLATE_MONARCH, + LLM_CHAT_TEMPLATE_GEMMA, + LLM_CHAT_TEMPLATE_ORION, + LLM_CHAT_TEMPLATE_OPENCHAT, + LLM_CHAT_TEMPLATE_VICUNA, + LLM_CHAT_TEMPLATE_VICUNA_ORCA, + LLM_CHAT_TEMPLATE_DEEPSEEK, + LLM_CHAT_TEMPLATE_DEEPSEEK_2, + LLM_CHAT_TEMPLATE_COMMAND_R, + LLM_CHAT_TEMPLATE_LLAMA_3, + LLM_CHAT_TEMPLATE_CHATGML_3, + LLM_CHAT_TEMPLATE_CHATGML_4, + LLM_CHAT_TEMPLATE_MINICPM, + LLM_CHAT_TEMPLATE_EXAONE_3, + LLM_CHAT_TEMPLATE_RWKV_WORLD, + LLM_CHAT_TEMPLATE_GRANITE, + LLM_CHAT_TEMPLATE_GIGACHAT, + LLM_CHAT_TEMPLATE_UNKNOWN, +}; + +struct llama_chat_message; + +llm_chat_template llm_chat_template_from_str(const std::string & name); + +llm_chat_template llm_chat_detect_template(const std::string & tmpl); + +int32_t llm_chat_apply_template( + llm_chat_template tmpl, + const std::vector<const llama_chat_message *> & chat, + std::string & dest, bool add_ass); diff --git a/src/llama.cpp b/src/llama.cpp index e96d778d9..2f2b4f971 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1,5 +1,6 @@ #include "llama-impl.h" +#include "llama-chat.h" #include "llama-mmap.h" #include "llama-context.h" #include "llama-vocab.h" @@ -16,12 +17,6 @@ // TODO: replace with ggml API call #define QK_K 256 -#if __cplusplus >= 202000L - #define LU8(x) (const char*)(u8##x) -#else - #define LU8(x) u8##x -#endif - #include #include #include @@ -74,19 +69,6 @@ std::string format(const char 
* fmt, ...) { return std::string(buf.data(), size); } -// trim whitespace from the beginning and end of a string -static std::string trim(const std::string & str) { - size_t start = 0; - size_t end = str.size(); - while (start < end && isspace(str[start])) { - start += 1; - } - while (end > start && isspace(str[end - 1])) { - end -= 1; - } - return str.substr(start, end - start); -} - static bool is_float_close(float a, float b, float abs_tol) { // Check for non-negative tolerance if (abs_tol < 0.0) { @@ -17131,481 +17113,6 @@ int32_t llama_detokenize( // chat templates // -static llm_chat_template llama_chat_detect_template(const std::string & tmpl) { - try { - return llm_chat_template_from_str(tmpl); - } catch (const std::out_of_range &) { - // ignore - } - - auto tmpl_contains = [&tmpl](const char * haystack) -> bool { - return tmpl.find(haystack) != std::string::npos; - }; - if (tmpl_contains("<|im_start|>")) { - return LLM_CHAT_TEMPLATE_CHATML; - } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) { - if (tmpl_contains("[SYSTEM_PROMPT]")) { - return LLM_CHAT_TEMPLATE_MISTRAL_V7; - } else if ( - // catches official 'v1' template - tmpl_contains("' [INST] ' + system_message") - // catches official 'v3' and 'v3-tekken' templates - || tmpl_contains("[AVAILABLE_TOOLS]") - ) { - // Official mistral 'v1', 'v3' and 'v3-tekken' templates - // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md - // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md - if (tmpl_contains(" [INST]")) { - return LLM_CHAT_TEMPLATE_MISTRAL_V1; - } else if (tmpl_contains("\"[INST]\"")) { - return LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN; - } - return LLM_CHAT_TEMPLATE_MISTRAL_V3; - } else { - // llama2 template and its variants - // [variant] support system message - // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2 - bool support_system_message = tmpl_contains("<>"); - bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]"); - bool strip_message = tmpl_contains("content.strip()"); - if (strip_message) { - return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP; - } else if (add_bos_inside_history) { - return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS; - } else if (support_system_message) { - return LLM_CHAT_TEMPLATE_LLAMA_2_SYS; - } else { - return LLM_CHAT_TEMPLATE_LLAMA_2; - } - } - } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) { - return LLM_CHAT_TEMPLATE_PHI_3; - } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) { - return LLM_CHAT_TEMPLATE_FALCON_3; - } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) { - return LLM_CHAT_TEMPLATE_ZEPHYR; - } else if (tmpl_contains("bos_token + message['role']")) { - return LLM_CHAT_TEMPLATE_MONARCH; - } else if (tmpl_contains("")) { - return LLM_CHAT_TEMPLATE_GEMMA; - } else if (tmpl_contains("'\\n\\nAssistant: ' + eos_token")) { - // OrionStarAI/Orion-14B-Chat - return LLM_CHAT_TEMPLATE_ORION; - } else if (tmpl_contains("GPT4 Correct ")) { - // openchat/openchat-3.5-0106 - return LLM_CHAT_TEMPLATE_OPENCHAT; - } else if (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: ")) { - // eachadea/vicuna-13b-1.1 (and Orca variant) - if (tmpl_contains("SYSTEM: ")) { - return LLM_CHAT_TEMPLATE_VICUNA_ORCA; - } - return LLM_CHAT_TEMPLATE_VICUNA; - } else if (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>")) { - // deepseek-ai/deepseek-coder-33b-instruct - return LLM_CHAT_TEMPLATE_DEEPSEEK; - } else if 
(tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>")) { - // CohereForAI/c4ai-command-r-plus - return LLM_CHAT_TEMPLATE_COMMAND_R; - } else if (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>")) { - return LLM_CHAT_TEMPLATE_LLAMA_3; - } else if (tmpl_contains("[gMASK]sop")) { - // chatglm3-6b - return LLM_CHAT_TEMPLATE_CHATGML_3; - } else if (tmpl_contains("[gMASK]")) { - return LLM_CHAT_TEMPLATE_CHATGML_4; - } else if (tmpl_contains(LU8("<用户>"))) { - // MiniCPM-3B-OpenHermes-2.5-v2-GGUF - return LLM_CHAT_TEMPLATE_MINICPM; - } else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) { - return LLM_CHAT_TEMPLATE_DEEPSEEK_2; - } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) { - // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb - // EXAONE-3.0-7.8B-Instruct - return LLM_CHAT_TEMPLATE_EXAONE_3; - } else if (tmpl_contains("rwkv-world")) { - return LLM_CHAT_TEMPLATE_RWKV_WORLD; - } else if (tmpl_contains("<|start_of_role|>")) { - return LLM_CHAT_TEMPLATE_GRANITE; - } else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) { - return LLM_CHAT_TEMPLATE_GIGACHAT; - } else if (tmpl_contains("<|role_start|>")) { - return LLM_CHAT_TEMPLATE_MEGREZ; - } - return LLM_CHAT_TEMPLATE_UNKNOWN; -} - -// Simple version of "llama_apply_chat_template" that only works with strings -// This function uses heuristic checks to determine commonly used template. It is not a jinja parser. -static int32_t llama_chat_apply_template_internal( - const llm_chat_template tmpl, - const std::vector & chat, - std::string & dest, bool add_ass) { - // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527 - std::stringstream ss; - if (tmpl == LLM_CHAT_TEMPLATE_CHATML) { - // chatml template - for (auto message : chat) { - ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n"; - } - if (add_ass) { - ss << "<|im_start|>assistant\n"; - } - } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) { - // Official mistral 'v7' template - // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7 - for (auto message : chat) { - std::string role(message->role); - std::string content(message->content); - if (role == "system") { - ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]"; - } else if (role == "user") { - ss << "[INST] " << content << "[/INST]"; - } - else { - ss << " " << content << ""; - } - } - } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 - || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3 - || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN) { - // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md - // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md - std::string leading_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 ? " " : ""; - std::string trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN ? 
"" : " "; - bool trim_assistant_message = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3; - bool is_inside_turn = false; - for (auto message : chat) { - if (!is_inside_turn) { - ss << leading_space << "[INST]" << trailing_space; - is_inside_turn = true; - } - std::string role(message->role); - std::string content(message->content); - if (role == "system") { - ss << content << "\n\n"; - } else if (role == "user") { - ss << content << leading_space << "[/INST]"; - } else { - ss << trailing_space << (trim_assistant_message ? trim(content) : content) << ""; - is_inside_turn = false; - } - } - } else if ( - tmpl == LLM_CHAT_TEMPLATE_LLAMA_2 - || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS - || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS - || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP) { - // llama2 template and its variants - // [variant] support system message - // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2 - bool support_system_message = tmpl != LLM_CHAT_TEMPLATE_LLAMA_2; - // [variant] add BOS inside history - bool add_bos_inside_history = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS; - // [variant] trim spaces from the input message - bool strip_message = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP; - // construct the prompt - bool is_inside_turn = true; // skip BOS at the beginning - ss << "[INST] "; - for (auto message : chat) { - std::string content = strip_message ? trim(message->content) : message->content; - std::string role(message->role); - if (!is_inside_turn) { - is_inside_turn = true; - ss << (add_bos_inside_history ? "[INST] " : "[INST] "); - } - if (role == "system") { - if (support_system_message) { - ss << "<>\n" << content << "\n<>\n\n"; - } else { - // if the model does not support system message, we still include it in the first message, but without <> - ss << content << "\n"; - } - } else if (role == "user") { - ss << content << " [/INST]"; - } else { - ss << content << ""; - is_inside_turn = false; - } - } - } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_3) { - // Phi 3 - for (auto message : chat) { - std::string role(message->role); - ss << "<|" << role << "|>\n" << message->content << "<|end|>\n"; - } - if (add_ass) { - ss << "<|assistant|>\n"; - } - } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) { - // Falcon 3 - for (auto message : chat) { - std::string role(message->role); - ss << "<|" << role << "|>\n" << message->content << "\n"; - } - if (add_ass) { - ss << "<|assistant|>\n"; - } - } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) { - // zephyr template - for (auto message : chat) { - ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n"; - } - if (add_ass) { - ss << "<|assistant|>\n"; - } - } else if (tmpl == LLM_CHAT_TEMPLATE_MONARCH) { - // mlabonne/AlphaMonarch-7B template (the is included inside history) - for (auto message : chat) { - std::string bos = (message == chat.front()) ? "" : ""; // skip BOS for first message - ss << bos << message->role << "\n" << message->content << "\n"; - } - if (add_ass) { - ss << "assistant\n"; - } - } else if (tmpl == LLM_CHAT_TEMPLATE_GEMMA) { - // google/gemma-7b-it - std::string system_prompt = ""; - for (auto message : chat) { - std::string role(message->role); - if (role == "system") { - // there is no system message for gemma, but we will merge it with user prompt, so nothing is broken - system_prompt = trim(message->content); - continue; - } - // in gemma, "assistant" is "model" - role = role == "assistant" ? 
"model" : message->role; - ss << "" << role << "\n"; - if (!system_prompt.empty() && role != "model") { - ss << system_prompt << "\n\n"; - system_prompt = ""; - } - ss << trim(message->content) << "\n"; - } - if (add_ass) { - ss << "model\n"; - } - } else if (tmpl == LLM_CHAT_TEMPLATE_ORION) { - // OrionStarAI/Orion-14B-Chat - std::string system_prompt = ""; - for (auto message : chat) { - std::string role(message->role); - if (role == "system") { - // there is no system message support, we will merge it with user prompt - system_prompt = message->content; - continue; - } else if (role == "user") { - ss << "Human: "; - if (!system_prompt.empty()) { - ss << system_prompt << "\n\n"; - system_prompt = ""; - } - ss << message->content << "\n\nAssistant: "; - } else { - ss << message->content << ""; - } - } - } else if (tmpl == LLM_CHAT_TEMPLATE_OPENCHAT) { - // openchat/openchat-3.5-0106, - for (auto message : chat) { - std::string role(message->role); - if (role == "system") { - ss << message->content << "<|end_of_turn|>"; - } else { - role[0] = toupper(role[0]); - ss << "GPT4 Correct " << role << ": " << message->content << "<|end_of_turn|>"; - } - } - if (add_ass) { - ss << "GPT4 Correct Assistant:"; - } - } else if (tmpl == LLM_CHAT_TEMPLATE_VICUNA || tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) { - // eachadea/vicuna-13b-1.1 (and Orca variant) - for (auto message : chat) { - std::string role(message->role); - if (role == "system") { - // Orca-Vicuna variant uses a system prefix - if (tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) { - ss << "SYSTEM: " << message->content << "\n"; - } else { - ss << message->content << "\n\n"; - } - } else if (role == "user") { - ss << "USER: " << message->content << "\n"; - } else if (role == "assistant") { - ss << "ASSISTANT: " << message->content << "\n"; - } - } - if (add_ass) { - ss << "ASSISTANT:"; - } - } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK) { - // deepseek-ai/deepseek-coder-33b-instruct - for (auto message : chat) { - std::string role(message->role); - if (role == "system") { - ss << message->content; - } else if (role == "user") { - ss << "### Instruction:\n" << message->content << "\n"; - } else if (role == "assistant") { - ss << "### Response:\n" << message->content << "\n<|EOT|>\n"; - } - } - if (add_ass) { - ss << "### Response:\n"; - } - } else if (tmpl == LLM_CHAT_TEMPLATE_COMMAND_R) { - // CohereForAI/c4ai-command-r-plus - for (auto message : chat) { - std::string role(message->role); - if (role == "system") { - ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>"; - } else if (role == "user") { - ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>"; - } else if (role == "assistant") { - ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>"; - } - } - if (add_ass) { - ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"; - } - } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA_3) { - // Llama 3 - for (auto message : chat) { - std::string role(message->role); - ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n" << trim(message->content) << "<|eot_id|>"; - } - if (add_ass) { - ss << "<|start_header_id|>assistant<|end_header_id|>\n\n"; - } - } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) { - // chatglm3-6b - ss << "[gMASK]" << "sop"; - for (auto message : chat) { - std::string role(message->role); - ss << "<|" << role << "|>" << "\n " << message->content; - } - if (add_ass) { - ss << "<|assistant|>"; - } - } 
else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) { - ss << "[gMASK]" << ""; - for (auto message : chat) { - std::string role(message->role); - ss << "<|" << role << "|>" << "\n" << message->content; - } - if (add_ass) { - ss << "<|assistant|>"; - } - } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) { - // MiniCPM-3B-OpenHermes-2.5-v2-GGUF - for (auto message : chat) { - std::string role(message->role); - if (role == "user") { - ss << LU8("<用户>"); - ss << trim(message->content); - ss << ""; - } else { - ss << trim(message->content); - } - } - } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_2) { - // DeepSeek-V2 - for (auto message : chat) { - std::string role(message->role); - if (role == "system") { - ss << message->content << "\n\n"; - } else if (role == "user") { - ss << "User: " << message->content << "\n\n"; - } else if (role == "assistant") { - ss << "Assistant: " << message->content << LU8("<|end▁of▁sentence|>"); - } - } - if (add_ass) { - ss << "Assistant:"; - } - } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) { - // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb - // EXAONE-3.0-7.8B-Instruct - for (auto message : chat) { - std::string role(message->role); - if (role == "system") { - ss << "[|system|]" << trim(message->content) << "[|endofturn|]\n"; - } else if (role == "user") { - ss << "[|user|]" << trim(message->content) << "\n"; - } else if (role == "assistant") { - ss << "[|assistant|]" << trim(message->content) << "[|endofturn|]\n"; - } - } - if (add_ass) { - ss << "[|assistant|]"; - } - } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) { - // this template requires the model to have "\n\n" as EOT token - for (auto message : chat) { - std::string role(message->role); - if (role == "user") { - ss << "User: " << message->content << "\n\nAssistant:"; - } else { - ss << message->content << "\n\n"; - } - } - } else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) { - // IBM Granite template - for (const auto & message : chat) { - std::string role(message->role); - ss << "<|start_of_role|>" << role << "<|end_of_role|>"; - if (role == "assistant_tool_call") { - ss << "<|tool_call|>"; - } - ss << message->content << "<|end_of_text|>\n"; - } - if (add_ass) { - ss << "<|start_of_role|>assistant<|end_of_role|>\n"; - } - } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) { - // GigaChat template - bool has_system = !chat.empty() && std::string(chat[0]->role) == "system"; - - // Handle system message if present - if (has_system) { - ss << "" << chat[0]->content << "<|message_sep|>"; - } else { - ss << ""; - } - - // Process remaining messages - for (size_t i = has_system ? 
1 : 0; i < chat.size(); i++) { - std::string role(chat[i]->role); - if (role == "user") { - ss << "user<|role_sep|>" << chat[i]->content << "<|message_sep|>" - << "available functions<|role_sep|>[]<|message_sep|>"; - } else if (role == "assistant") { - ss << "assistant<|role_sep|>" << chat[i]->content << "<|message_sep|>"; - } - } - - // Add generation prompt if needed - if (add_ass) { - ss << "assistant<|role_sep|>"; - } - } else if (tmpl == LLM_CHAT_TEMPLATE_MEGREZ) { - // Megrez template - for (auto message : chat) { - std::string role(message->role); - ss << "<|role_start|>" << role << "<|role_end|>" << message->content << "<|turn_end|>"; - } - - if (add_ass) { - ss << "<|role_start|>assistant<|role_end|>"; - } - } else { - // template not supported - return -1; - } - dest = ss.str(); - return dest.size(); -} - int32_t llama_chat_apply_template( const struct llama_model * model, const char * tmpl, @@ -17625,7 +17132,7 @@ int32_t llama_chat_apply_template( } else { // worst case: there is no information about template, we will use chatml by default - curr_tmpl = "chatml"; // see llama_chat_apply_template_internal + curr_tmpl = "chatml"; // see llm_chat_apply_template } } @@ -17637,11 +17144,11 @@ int32_t llama_chat_apply_template( } std::string formatted_chat; - llm_chat_template detected_tmpl = llama_chat_detect_template(curr_tmpl); + llm_chat_template detected_tmpl = llm_chat_detect_template(curr_tmpl); if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) { return -1; } - int32_t res = llama_chat_apply_template_internal(detected_tmpl, chat_vec, formatted_chat, add_ass); + int32_t res = llm_chat_apply_template(detected_tmpl, chat_vec, formatted_chat, add_ass); if (res < 0) { return res; }
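Usage note (not part of the patch): the sketch below is an illustrative, assumed example of how the relocated internal helpers declared in src/llama-chat.h could be exercised from code inside the library tree. llama_chat_message is the role/content pair from llama.h, and the public llama_chat_apply_template() entry point is unchanged; only the internal llm_chat_* functions shown above are used here. The file layout and build wiring are assumptions, not something this commit adds.

    // illustrative only: render a short conversation with the internal helpers
    #include "llama-chat.h"
    #include "llama.h"   // for llama_chat_message (role/content pair)

    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        std::vector<llama_chat_message> msgs = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello!" },
        };

        // llm_chat_apply_template() takes pointers to the messages
        std::vector<const llama_chat_message *> chat;
        for (const auto & m : msgs) {
            chat.push_back(&m);
        }

        // resolve the short template name; a full Jinja string would instead be
        // matched by the heuristics in llm_chat_detect_template()
        const llm_chat_template tmpl = llm_chat_detect_template("chatml");

        std::string prompt;
        const int32_t res = llm_chat_apply_template(tmpl, chat, prompt, /*add_ass=*/true);
        if (res < 0) {
            fprintf(stderr, "template not supported\n");
            return 1;
        }

        printf("%s", prompt.c_str());
        return 0;
    }

The public llama_chat_builtin_templates(output, len) helper, moved into llama-chat.cpp above, still enumerates the short names ("chatml", "llama2", ...) accepted by llm_chat_template_from_str().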