diff --git a/convert-mpt-hf-to-gguf.py b/convert-mpt-hf-to-gguf.py
index 21b9fd507..2d2fa2329 100755
--- a/convert-mpt-hf-to-gguf.py
+++ b/convert-mpt-hf-to-gguf.py
@@ -128,15 +128,22 @@ vocab_size = hparams["vocab_size"]
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 
+added_vocab = tokenizer.get_added_vocab()
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 
 for i in range(vocab_size):
-    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
-    scores.append(0.0) # dummy
-    toktypes.append(gguf.TokenType.NORMAL)
+    if i not in reverse_vocab:
+        tokens.append(f"[PAD{i}]")
+        toktypes.append(gguf.TokenType.USER_DEFINED)
+    elif reverse_vocab[i] in added_vocab:
+        # NOTE: wouldn't we like to distinguish CONTROL tokens here?
+        tokens.append(reverse_vocab[i])
+        toktypes.append(gguf.TokenType.USER_DEFINED)
+    else:
+        tokens.append(reverse_vocab[i])
+        toktypes.append(gguf.TokenType.NORMAL)
 
 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
 
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
diff --git a/llama.cpp b/llama.cpp
index 8d52eaf62..c63e6251c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -975,14 +975,15 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
 
-static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
     const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
     if (n_tokens < 0) {
         result.resize(-n_tokens);
         int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
         GGML_ASSERT(check == -n_tokens);
-    } else {
+    }
+    else {
         result.resize(n_tokens);
     }
 
@@ -1202,10 +1203,10 @@ struct llama_vocab {
     id special_eot_id = 32010;
 
     int find_bpe_rank(std::string token_left, std::string token_right) const {
-        replace_all(token_left, " ", "\u0120");
-        replace_all(token_left, "\n", "\u010A");
-        replace_all(token_right, " ", "\u0120");
-        replace_all(token_right, "\n", "\u010A");
+        GGML_ASSERT(token_left.find(" ") == std::string::npos);
+        GGML_ASSERT(token_left.find("\n") == std::string::npos);
+        GGML_ASSERT(token_right.find(" ") == std::string::npos);
+        GGML_ASSERT(token_right.find("\n") == std::string::npos);
 
         auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
         if (it == bpe_ranks.end()) {
@@ -7499,7 +7500,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 
     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
-        const std::string piece = llama_token_to_str(ctx, id);
+        const std::string piece = llama_token_to_piece(ctx, id);
         if (id == eos) {
             if (!allow_eos) {
                 candidates->data[i].logit = -INFINITY;
@@ -7711,7 +7712,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }
 
-    const std::string piece = llama_token_to_str(ctx, token);
+    const std::string piece = llama_token_to_piece(ctx, token);
 
     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
diff --git a/models/ggml-vocab-mpt.gguf b/models/ggml-vocab-mpt.gguf
new file mode 100644
index 000000000..6affa34bd
Binary files /dev/null and b/models/ggml-vocab-mpt.gguf differ
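Note on the converter hunk above: the token loop now classifies each vocab slot three ways (missing slots become [PAD<i>] with type USER_DEFINED, tokens from get_added_vocab() become USER_DEFINED, everything else stays NORMAL), and the dummy per-token scores are dropped along with add_token_scores. The sketch below restates that rule as a standalone helper and adds one hypothetical branch, prompted by the NOTE in the hunk, that would map tokenizer special tokens to CONTROL via all_special_ids; that branch is an assumption, not part of the patch, and dir_model is a placeholder path.

import gguf
from transformers import AutoTokenizer

dir_model = "path/to/mpt-hf-checkpoint"  # placeholder path
tokenizer = AutoTokenizer.from_pretrained(dir_model)

added_vocab = tokenizer.get_added_vocab()
special_ids = set(tokenizer.all_special_ids)  # assumed basis for CONTROL tokens
reverse_vocab = {tid: tok for tok, tid in tokenizer.vocab.items()}

def token_type(i: int) -> gguf.TokenType:
    # Hypothetical restatement of the converter's classification rule.
    if i not in reverse_vocab:
        return gguf.TokenType.USER_DEFINED   # padding slot, written as f"[PAD{i}]"
    if i in special_ids:
        return gguf.TokenType.CONTROL        # assumed extension; the patch uses USER_DEFINED here
    if reverse_vocab[i] in added_vocab:
        return gguf.TokenType.USER_DEFINED   # token added on top of the base vocab
    return gguf.TokenType.NORMAL             # ordinary BPE vocab entry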
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 61407e573..1c73de0a3 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -31,6 +31,7 @@ llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE
 llama_build_executable(test-tokenizer-1-bpe.cpp)
 llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
+llama_test_executable(test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
 llama_build_and_test_executable(test-grammar-parser.cpp)
 llama_build_and_test_executable(test-llama-grammar.cpp)
 llama_build_and_test_executable(test-grad0.cpp) # SLOW
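The added CMake entry reuses the shared test-tokenizer-1-bpe.cpp driver against the new models/ggml-vocab-mpt.gguf file. For orientation only, a rough Python analogue of a vocab round-trip check is sketched below; it is not what the C++ test executes, and the mosaicml/mpt-7b checkpoint id is an assumption.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mosaicml/mpt-7b")  # assumed checkpoint

mismatches = 0
for token_id in range(tokenizer.vocab_size):
    piece = tokenizer.decode([token_id])                     # id -> text piece
    ids = tokenizer.encode(piece, add_special_tokens=False)  # piece -> ids again
    if ids != [token_id]:
        mismatches += 1

print(f"{mismatches} of {tokenizer.vocab_size} tokens do not round-trip in isolation")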