llama : add BPE pre-tokenization for Qwen2 (#7114)

* Add BPE pre-tokenization for Qwen2.

* minor : fixes

---------

Co-authored-by: Ren Xuancheng <17811943+jklj077@users.noreply.github.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-12-24 10:24:35 +00:00 · 2024-05-08 20:06:43 +08:00 · 2024-05-08 20:06:43 +08:00 · 229ffff872
commit 229ffff872
parent 1fd9c1741d
8 changed files with 167 additions and 2 deletions
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@ -67,6 +67,7 @@ models = [
    {"name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
    {"name": "refact",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
    {"name": "command-r",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
    {"name": "qwen2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
    {"name": "olmo",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
    {"name": "dbrx",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
 ]
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@ -314,6 +314,9 @@ class Model(ABC):
        if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
            # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
            res = "command-r"
        if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
            # ref: https://huggingface.co/Qwen/Qwen1.5-7B
            res = "qwen2"
        if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
            # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
            res = "olmo"
--- a/llama.cpp
+++ b/llama.cpp
@ -4391,6 +4391,9 @@ static void llm_load_vocab(
            } else if (
                tokenizer_pre == "command-r") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
            } else if (
                tokenizer_pre == "qwen2") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
            } else if (
                tokenizer_pre == "olmo") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
@ -12263,6 +12266,13 @@ struct llm_tokenizer_bpe {
                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                        });
                        break;
                    case LLAMA_VOCAB_PRE_TYPE_QWEN2:
                        word_collection = unicode_regex_split(text, {
                            // original regex from tokenizer.json
                            // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                        });
                        break;
                    default:
                        // default regex for BPE tokenization pre-processing
                        word_collection = unicode_regex_split(text, {
--- a/llama.h
+++ b/llama.h
@ -81,8 +81,9 @@ extern "C" {
        LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
        LLAMA_VOCAB_PRE_TYPE_REFACT         = 8,
        LLAMA_VOCAB_PRE_TYPE_COMMAND_R      = 9,
-        LLAMA_VOCAB_PRE_TYPE_OLMO           = 10,
+        LLAMA_VOCAB_PRE_TYPE_QWEN2          = 10,
-        LLAMA_VOCAB_PRE_TYPE_DBRX           = 11,
+        LLAMA_VOCAB_PRE_TYPE_OLMO           = 11,
        LLAMA_VOCAB_PRE_TYPE_DBRX           = 12,
    };
    // note: these values should be synchronized with ggml_rope
--- a/models/ggml-vocab-qwen2.gguf
+++ b/models/ggml-vocab-qwen2.gguf
--- a/models/ggml-vocab-qwen2.gguf.inp
+++ b/models/ggml-vocab-qwen2.gguf.inp
@ -0,0 +1,106 @@
 ied 4 ½ months
 __ggml_vocab_test__
 Führer
 __ggml_vocab_test__
 __ggml_vocab_test__
 __ggml_vocab_test__
 __ggml_vocab_test__
 __ggml_vocab_test__
 __ggml_vocab_test__
 __ggml_vocab_test__
 __ggml_vocab_test__
 __ggml_vocab_test__
 __ggml_vocab_test__
 Hello world
 __ggml_vocab_test__
 Hello world
 __ggml_vocab_test__
 Hello World
 __ggml_vocab_test__
 Hello World
 __ggml_vocab_test__
 Hello World!
 __ggml_vocab_test__
 Hello, world!
 __ggml_vocab_test__
 Hello, world!
 __ggml_vocab_test__
 this is 🦙.cpp
 __ggml_vocab_test__
 w048 7tuijk dsdfhu
 __ggml_vocab_test__
 нещо на Български
 __ggml_vocab_test__
 កាន់តែពិសេសអាចខលចេញ
 __ggml_vocab_test__
 🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
 __ggml_vocab_test__
 Hello
 __ggml_vocab_test__
 Hello
 __ggml_vocab_test__
  Hello
 __ggml_vocab_test__
   Hello
 __ggml_vocab_test__
    Hello
 __ggml_vocab_test__
    Hello
    Hello
 __ggml_vocab_test__
 (
 __ggml_vocab_test__
 =
 __ggml_vocab_test__
 ' era
 __ggml_vocab_test__
 Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
 __ggml_vocab_test__
 3
 __ggml_vocab_test__
 33
 __ggml_vocab_test__
 333
 __ggml_vocab_test__
 3333
 __ggml_vocab_test__
 33333
 __ggml_vocab_test__
 333333
 __ggml_vocab_test__
 3333333
 __ggml_vocab_test__
 33333333
 __ggml_vocab_test__
 333333333
 __ggml_vocab_test__
 🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
 __ggml_vocab_test__
--- a/models/ggml-vocab-qwen2.gguf.out
+++ b/models/ggml-vocab-qwen2.gguf.out
@ -0,0 +1,43 @@
 1122 220 19 220 26062 3951
 37 50753 261
 220
 256
 262
 197
 198
 271
 1406
 1572
 9707 1879
 21927 1879
 9707 4337
 21927 4337
 21927 4337 0
 9707 11 1879 0
 21927 11 1879 0
 419 374 11162 99 247 13 10821
 86 15 19 23 220 22 83 1963 41808 11472 2940 16739
 78762 14144 1456 13073 63471 33594 3038 133178 79012
 146394 97529 241 44258 233 146568 44258 224 147603 20879 115 146280 44258 223 146280 147272 97529 227 147805 148301 147270 44258 223 146848
 145836 320 8252 8 26525 114 378 235 149921 30543 320 35673 99066 97534 8 25521 227 320 3243 42365 429 702 1181 1828 3950 8
 9707
 21927
 220 21927
 256 21927
 262 21927
 262 21927 198 262 21927
 320
 198 284
 6 11385
 9707 11 379 64848 0 2585 525 498 26525 223 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216
 18
 18 18
 18 18 18
 18 18 18 18
 18 18 18 18 18
 18 18 18 18 18 18
 18 18 18 18 18 18 18
 18 18 18 18 18 18 18 18
 18 18 18 18 18 18 18 18 18
 198 4710 14731 65497 7847 1572 2303 78672 10947 145836 320 8252 8 26525 114 378 235 149921 30543 320 35673 99066 97534 8 25521 227 11162 99 247 149955 220 18 220 18 18 220 18 18 18 220 18 18 18 18 220 18 18 18 18 18 220 18 18 18 18 18 18 220 18 18 18 18 18 18 18 220 18 18 18 18 18 18 18 18 220 18 13 18 220 18 496 18 220 18 1112 18 220 146394 97529 241 44258 233 146568 44258 224 147603 20879 115 146280 44258 223 146280 147272 97529 227 144534 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216 55460 53237 18658 14144 1456 13073 63471 33594 3038 133178 79012 3355 4605 4605 13874 13874 73594 3014 3014 28149 17085 2928 26610 7646 358 3003 1012 364 83 813 566 594 1052 11 364 787 498 2704 30 364 44 537 2704 358 3278 1281 432 11 364 35 498 1075 1045 15243 30 1205 6 42612 264 63866 43
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -84,6 +84,7 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder         ARGS ${CMAKE
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2             ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact            ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2             ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)
 # build test-tokenizer-1-bpe target once and add many tests
 add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)