support glm-4-9b-chat

Signed-off-by: XingXing Qiao <qiaoxx@dingdao.com>
2025-01-13 20:14:29 +00:00 · 2024-06-17 10:08:52 +08:00 · 2024-06-17 10:08:52 +08:00 · 1fc5bf5bcb
commit 1fc5bf5bcb
parent f3bc337f43
5 changed files with 116 additions and 7 deletions
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@ -476,6 +476,9 @@ class Model:
        if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
            # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
            res = "smaug-bpe"
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"

        if res is None:
            logger.warning("\n")
@ -2714,7 +2717,7 @@ class DeepseekV2Model(Model):
 class ChatGLMModel(Model):
    model_arch = gguf.MODEL_ARCH.CHATGLM

-    def set_vocab(self):
+    def set_vocab_chatglm3(self):
        dir_model = self.dir_model
        hparams = self.hparams
        tokens: list[bytearray] = []
@ -2725,7 +2728,8 @@ class ChatGLMModel(Model):
        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
        vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
        assert max(tokenizer.get_vocab().values()) < vocab_size
-
+        print(vocab_size)
+        print(max(tokenizer.get_vocab().values()))
        for token_id in range(vocab_size):
            piece = tokenizer._convert_id_to_token(token_id)
            if token_id == 0:
@ -2774,6 +2778,91 @@ class ChatGLMModel(Model):
        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)

+    @staticmethod
+    def token_bytes_to_string(b):
+        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+        byte_encoder = bytes_to_unicode()
+        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
+
+    @staticmethod
+    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
+        parts = [bytes([b]) for b in token]
+        while True:
+            min_idx = None
+            min_rank = None
+            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
+                rank = mergeable_ranks.get(pair[0] + pair[1])
+                if rank is not None and (min_rank is None or rank < min_rank):
+                    min_idx = i
+                    min_rank = rank
+            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
+                break
+            assert min_idx is not None
+            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
+        return parts
+
+    def set_vocab(self):
+        if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""):
+            self.set_vocab_chatglm3()
+            return
+
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams["padded_vocab_size"]
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
+            assert len(merged) >= 2 and len(merged) <= 7
+            merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
+
+        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
+        added_vocab = tokenizer.get_added_vocab()
+        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.USER_DEFINED)
+            elif reverse_vocab[i] in added_vocab:
+                tokens.append(reverse_vocab[i])
+                if tokenizer.added_tokens_decoder[i].special:
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.USER_DEFINED)
+            else:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.NORMAL)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
+        special_vocab.chat_template = "ChatGLM4"
+        special_vocab.merges = merges
+        # only add special tokens when they were not already loaded from config.json
+        if len(special_vocab.special_token_ids) == 0:
+            special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
+            special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        # this one is usually not in config.json anyway
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
    def set_gguf_parameters(self):
        self.gguf_writer.add_name(self.dir_model.name)
        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
@ -2934,7 +3023,8 @@ def main() -> None:
    with torch.inference_mode():
        model_class = Model.from_model_architecture(hparams["architectures"][0])
        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy)
-
+        print(model_class)
+        print(model_instance)
        logger.info("Set model parameters")
        model_instance.set_gguf_parameters()

--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -3056,7 +3056,7 @@ int main(int argc, char ** argv) {
        chat.push_back({{"role", "user"},      {"content", "Hello"}});
        chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
        chat.push_back({{"role", "user"},      {"content", "How are you?"}});
-
+        printf("sparams.chat_template: #%s#\n", sparams.chat_template.c_str());
        const std::string chat_example = format_chat(ctx_server.model, sparams.chat_template, chat);

        LOG_INFO("chat template", {
--- a/llama.cpp
+++ b/llama.cpp
@ -4508,6 +4508,7 @@ static void llm_load_hparams(
            ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
            switch (hparams.n_layer) {
                case 28: model.type = e_model::MODEL_7B; break;
+                case 40: model.type = e_model::MODEL_8B; break;
                default: model.type = e_model::MODEL_UNKNOWN;
            }
        } break;
@ -4636,9 +4637,9 @@ static void llm_load_vocab(
            if (merges_keyidx == -1) {
                throw std::runtime_error("cannot find tokenizer merges in model file\n");
            }
-
+            printf("merges_keyidx: %d\n", merges_keyidx);
            const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
-
+            printf("n_merges: %d\n", n_merges);
            for (int i = 0; i < n_merges; i++) {
                const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
                GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
@ -4728,6 +4729,9 @@ static void llm_load_vocab(
            } else if (
                tokenizer_pre == "smaug-bpe") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+            } else if (
+                tokenizer_pre == "chatglm-bpe") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
            } else {
                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
            }
@ -11449,7 +11453,7 @@ struct llm_build_context {
                cb(Qcur, "Qcur", il);
                cb(Kcur, "Kcur", il);
                cb(Vcur, "Vcur", il);
- 
+                //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
                Qcur = ggml_rope_ext(
                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@ -13032,6 +13036,7 @@ struct llm_tokenizer_bpe {
                        break;
                    case LLAMA_VOCAB_PRE_TYPE_DBRX:
                    case LLAMA_VOCAB_PRE_TYPE_SMAUG:
+                    case LLAMA_VOCAB_PRE_TYPE_CHATGLM4:
                        word_collection = unicode_regex_split(text, {
                            // same as llama3
                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@ -18741,6 +18746,15 @@ static int32_t llama_chat_apply_template_internal(
        if (add_ass) {
            ss << "<|assistant|>";
        }
+    } else if (tmpl == "ChatGLM4") {
+        ss << "[gMASK]" << "<sop>";
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>" << "\n" << message->content;
+        }
+        if (add_ass) {
+            ss << "<|assistant|>";
+        }
    } else {
        // template not supported
        return -1;
--- a/llama.h
+++ b/llama.h
@ -86,6 +86,7 @@ extern "C" {
        LLAMA_VOCAB_PRE_TYPE_OLMO           = 12,
        LLAMA_VOCAB_PRE_TYPE_DBRX           = 13,
        LLAMA_VOCAB_PRE_TYPE_SMAUG          = 14,
+        LLAMA_VOCAB_PRE_TYPE_CHATGLM4       = 15,
    };

    // note: these values should be synchronized with ggml_rope
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@ -59,6 +59,8 @@ int main(void) {
        "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}",
        // ChatGLM3
        "{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|>\n {{ message['content'] }}{% else %}<|{{ message['role'] }}|>\n {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}",
+        // ChatGLM4
+        "ChatGLM4",
    };
    std::vector<std::string> expected_output = {
        // teknium/OpenHermes-2.5-Mistral-7B
@ -97,6 +99,8 @@ int main(void) {
        "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
        // ChatGLM3
        "[gMASK]sop<|system|>\n You are a helpful assistant<|user|>\n Hello<|assistant|>\n Hi there<|user|>\n Who are you<|assistant|>\n    I am an assistant   <|user|>\n Another question<|assistant|>",
+        // ChatGLM4
+        "[gMASK]<sop><|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n   I am an assistant   <|user|>\nAnother question<|assistant|>",
    };
    std::vector<char> formatted_chat(1024);
    int32_t res;