From e7a741695c5a23ef6d591555e7cc255f9b79d690 Mon Sep 17 00:00:00 2001
From: klosax <131523366+klosax@users.noreply.github.com>
Date: Tue, 1 Aug 2023 14:30:00 +0200
Subject: [PATCH] convert-gptneox-h5-to-gguf.py : Special tokens

---
 convert-gptneox-h5-to-gguf.py | 42 +++++++++++++++++++++++++++++++----
 1 file changed, 38 insertions(+), 4 deletions(-)

diff --git a/convert-gptneox-h5-to-gguf.py b/convert-gptneox-h5-to-gguf.py
index 266199509..f255cb7d7 100644
--- a/convert-gptneox-h5-to-gguf.py
+++ b/convert-gptneox-h5-to-gguf.py
@@ -58,7 +58,7 @@ for name in list_vars.keys():
 gguf_writer = gguf.GGUFWriter.open(fname_out)
 
 # This must be changed when adding/deleting kv
-kv_count = 14
+kv_count = 17
 
 print("tensors " + str(tensor_count) + " kv " + str(kv_count))
 
@@ -101,9 +101,43 @@ if Path(dir_model + "/tokenizer.json").is_file():
     merges = tokenizer["model"]["merges"]
 
 
-gguf_writer.write_tokenizer_model("gpt2")
-gguf_writer.write_token_list(tokens)
-gguf_writer.write_token_merges(merges)
+    gguf_writer.write_tokenizer_model("gpt2")
+    gguf_writer.write_token_list(tokens)
+    gguf_writer.write_token_merges(merges)
+
+    if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
+        print("Adding special token ids")
+
+        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+
+        # find special token ids
+
+        if "bos_token" in tokenizer_config:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["bos_token"]:
+                    gguf_writer.write_uint32("tokenizer.ggml.bos_token_id", key["id"] )
+
+        if "eos_token" in tokenizer_config:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["eos_token"]:
+                    gguf_writer.write_uint32("tokenizer.ggml.eos_token_id", key["id"] )
+
+        if "unk_token" in tokenizer_config:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["unk_token"]:
+                    gguf_writer.write_uint32("tokenizer.ggml.unknown_token_id", key["id"] )
+
+        if "sep_token" in tokenizer_config:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["sep_token"]:
+                    gguf_writer.write_uint32("tokenizer.ggml.separator_token_id", key["id"] )
+
+        if "pad_token" in tokenizer_config:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["pad_token"]:
+                    gguf_writer.write_uint32("tokenizer.ggml.padding_token_id", key["id"] )
+
 
 
 # TENSORS
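
Note (not part of the patch): a minimal standalone sketch of the lookup the second hunk performs, for reference. Each special token named in tokenizer_config.json is matched against the tokenizer's "added_tokens" list and its id is recorded under the corresponding tokenizer.ggml.* key. The sample dictionaries below are hypothetical stand-ins for the real tokenizer.json and tokenizer_config.json in the model directory, and the gguf_writer.write_uint32 call is replaced by a print.

    # Hypothetical stand-ins for the files read by the patch.
    tokenizer = {
        "added_tokens": [
            {"id": 0, "content": "<|endoftext|>"},
            {"id": 1, "content": "<|padding|>"},
        ]
    }
    tokenizer_config = {
        "bos_token": "<|endoftext|>",
        "eos_token": "<|endoftext|>",
        "pad_token": "<|padding|>",
    }

    # (tokenizer_config key, GGUF key) pairs as used in the patch.
    special_token_keys = [
        ("bos_token", "tokenizer.ggml.bos_token_id"),
        ("eos_token", "tokenizer.ggml.eos_token_id"),
        ("unk_token", "tokenizer.ggml.unknown_token_id"),
        ("sep_token", "tokenizer.ggml.separator_token_id"),
        ("pad_token", "tokenizer.ggml.padding_token_id"),
    ]

    for config_key, gguf_key in special_token_keys:
        if config_key in tokenizer_config:
            for token in tokenizer["added_tokens"]:
                if token["content"] == tokenizer_config[config_key]:
                    # the patch calls gguf_writer.write_uint32(gguf_key, token["id"]) here
                    print(gguf_key, "=", token["id"])

Running this prints bos/eos/padding ids (0, 0 and 1 for the sample data) and skips unk/sep because they are absent from the sample config, which mirrors how the patch only writes ids for tokens actually present in tokenizer_config.json.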