From 56df1fcdcb6e9abf74e11ea05741fa65dbc020be Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sun, 7 Jul 2024 16:13:35 -0400 Subject: [PATCH] llama : fix detection of control-like user-defined tokens --- src/llama.cpp | 3 ++- tests/test-tokenizer-0.cpp | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 3dfbf792b..1794ec2bd 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5513,7 +5513,8 @@ static void llm_load_vocab( } } - if ((token_data.attr & LLAMA_TOKEN_ATTR_USER_DEFINED) && token_data.text.find('<') && token_data.text.rfind('>')) { + if ((token_data.attr & LLAMA_TOKEN_ATTR_USER_DEFINED) && !token_data.text.empty() && + token_data.text.front() == '<' && token_data.text.back() == '>') { // Some models mark some added tokens which ought to be control tokens as not special. // (e.g. command-r, command-r-plus, deepseek-coder) // TODO: should this be fixed in the convert script instead? diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index 0c2d7781b..d3d21331b 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -195,7 +195,7 @@ int main(int argc, char **argv) { const bool add_special = false; for (const auto & test_kv : k_tests) { - const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special); + const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special, false); printf("\n"); printf("src: '%s'\n", test_kv.first.c_str()); @@ -253,7 +253,7 @@ int main(int argc, char **argv) { { const auto t_start = ggml_time_us(); - res = llama_tokenize(ctx, text, add_special); + res = llama_tokenize(ctx, text, add_special, false); const auto t_end = ggml_time_us();