From 784a14aa496a66cc43e76014a1bf4333b4a2a55a Mon Sep 17 00:00:00 2001 From: Sukriti Sharma Date: Sat, 7 Dec 2024 00:02:14 -0700 Subject: [PATCH] convert : add support for Roberta embeddings (#10695) --- convert_hf_to_gguf.py | 8 +- convert_hf_to_gguf_update.py | 1 + models/ggml-vocab-roberta-bpe.gguf.inp | 112 +++++++++++++++++++++++++ models/ggml-vocab-roberta-bpe.gguf.out | 46 ++++++++++ 4 files changed, 165 insertions(+), 2 deletions(-) create mode 100644 models/ggml-vocab-roberta-bpe.gguf.inp create mode 100644 models/ggml-vocab-roberta-bpe.gguf.out diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 9f1419e29..a4eece934 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -661,6 +661,9 @@ class Model: if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35": # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0 res = "minerva-7b" + if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65": + # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base + res = "roberta-bpe" if res is None: logger.warning("\n") @@ -2533,7 +2536,7 @@ class InternLM2Model(Model): return [(self.map_tensor_name(name), data_torch)] -@Model.register("BertModel", "CamembertModel") +@Model.register("BertModel", "CamembertModel", "RobertaModel") class BertModel(Model): model_arch = gguf.MODEL_ARCH.BERT @@ -2574,7 +2577,8 @@ class BertModel(Model): # we need this to validate the size of the token_type embeddings # though currently we are passing all zeros to the token_type embeddings - self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B" + # "Sequence A" or "Sequence B" + self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) # convert to phantom space vocab def phantom(tok): diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index ce3c571df..aa10e5db7 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -103,6 +103,7 @@ models = [ {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", }, {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", }, {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", }, + {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"}, ] diff --git a/models/ggml-vocab-roberta-bpe.gguf.inp b/models/ggml-vocab-roberta-bpe.gguf.inp new file mode 100644 index 000000000..9baf7d77a --- /dev/null +++ b/models/ggml-vocab-roberta-bpe.gguf.inp @@ -0,0 +1,112 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + + +__ggml_vocab_test__ + + + +__ggml_vocab_test__ + + + + +__ggml_vocab_test__ + + +__ggml_vocab_test__ +Hello world +__ggml_vocab_test__ + Hello world +__ggml_vocab_test__ +Hello World +__ggml_vocab_test__ + Hello World +__ggml_vocab_test__ + Hello World! +__ggml_vocab_test__ +Hello, world! +__ggml_vocab_test__ + Hello, world! +__ggml_vocab_test__ + this is 🦙.cpp +__ggml_vocab_test__ +w048 7tuijk dsdfhu +__ggml_vocab_test__ +нещо на Български +__ggml_vocab_test__ +កាន់តែពិសេសអាចខលចេញ +__ggml_vocab_test__ +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) +__ggml_vocab_test__ +Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello + Hello +__ggml_vocab_test__ + ( +__ggml_vocab_test__ + + = +__ggml_vocab_test__ +' era +__ggml_vocab_test__ +Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ +__ggml_vocab_test__ +!!!!!! +__ggml_vocab_test__ +3 +__ggml_vocab_test__ +33 +__ggml_vocab_test__ +333 +__ggml_vocab_test__ +3333 +__ggml_vocab_test__ +33333 +__ggml_vocab_test__ +333333 +__ggml_vocab_test__ +3333333 +__ggml_vocab_test__ +33333333 +__ggml_vocab_test__ +333333333 +__ggml_vocab_test__ +Cửa Việt +__ggml_vocab_test__ + discards +__ggml_vocab_test__ + + + + + + + + + + + +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL +__ggml_vocab_test__ diff --git a/models/ggml-vocab-roberta-bpe.gguf.out b/models/ggml-vocab-roberta-bpe.gguf.out new file mode 100644 index 000000000..f181ac3dc --- /dev/null +++ b/models/ggml-vocab-roberta-bpe.gguf.out @@ -0,0 +1,46 @@ + 2550 204 18430 377 + 597 2768 298 8564 + + 1437 + 1437 1437 + 1437 1437 1437 + 50117 + 50118 + 50140 + 50140 50118 + 50117 50118 + 31414 232 + 20920 232 + 31414 623 + 20920 623 + 20920 623 328 + 31414 6 232 328 + 20920 6 232 328 + 42 16 8103 18164 27 4 49317 + 605 40976 262 10109 18474 385 29 36807 6455 + 36765 25482 22063 23171 34251 18697 10809 26161 18697 3602 22063 27969 40966 25417 15264 26161 24269 36709 41171 35328 + 1376 17772 7471 1376 17772 19002 1376 17772 9085 1376 4333 13859 1376 17772 9357 1376 4333 9264 1376 17772 25448 1376 17772 18400 1376 17772 4333 1376 4333 10172 1376 17772 4333 1376 17772 7258 1376 17772 19002 1376 17772 5782 1376 17772 10172 1376 17772 3726 1376 17772 5782 1376 4333 10172 1376 17772 23171 + 6569 15113 7471 36 21113 43 17841 19002 17 8384 6569 14285 4958 12605 36 34654 2841 4203 354 10146 26511 1070 43 36174 5782 36 8338 21554 14 34 63 308 19233 43 + 31414 + 20920 + 1437 20920 + 1437 1437 20920 + 1437 1437 1437 20920 + 1437 1437 1437 20920 50118 1437 1437 1437 20920 + 36 + 50118 5457 + 108 3567 + 31414 6 1423 108 1250 328 1336 32 47 17841 10172 17487 47876 3602 48617 15264 46537 11423 27326 48494 8210 49233 1558 1570 27761 49429 43251 10809 17772 + 32376 12846 + 246 + 3103 + 25631 + 46152 + 3103 25631 + 46152 3103 + 46152 25631 + 46152 46152 + 46152 3103 25631 + 347 1376 2023 12410 102 16376 1376 2023 6382 90 + 9553 5954 + 50118 1437 50140 1437 50140 50118 1437 50117 1437 50117 50117 1437 50117 50118 1437 1437 50118 1437 1437 1437 50118 1437 1437 1437 1437 50118 1437 1437 1437 1437 1437 50118 6569 15113 7471 36 21113 43 17841 19002 17 8384 6569 14285 4958 12605 36 34654 2841 4203 354 10146 26511 1070 43 36174 5782 8103 18164 27 6569 18164 27 155 2357 30242 155 25631 30242 3103 30242 25631 30242 46152 30242 3103 25631 155 4 246 155 7586 246 155 734 246 25974 17772 7471 1376 17772 19002 1376 17772 9085 1376 4333 13859 1376 17772 9357 1376 4333 9264 1376 17772 25448 1376 17772 18400 1376 17772 4333 1376 4333 10172 1376 17772 4333 1376 17772 7258 1376 17772 19002 1376 17772 5782 18636 10172 17487 47876 3602 48617 15264 46537 11423 27326 48494 8210 49233 1558 1570 27761 49429 43251 10809 17772 36738 48332 47463 18697 10809 25482 22063 23171 34251 18697 10809 26161 18697 3602 22063 27969 40966 25417 15264 26161 24269 36709 41171 35328 128 49690 108 49972 49519 12905 48149 48149 43796 32376 12846 27282 28749 38 348 57 128 41042 37 18 89 6 128 4629 47 686 116 128 448 45 686 38 581 146 24 6 128 495 47 101 103 6845 116 166 108 30660 10 108 462 574