convert-gptneox-h5-to-gguf.py : gpt2bpe tokenizer

2024-12-27 03:44:35 +00:00 · 2023-08-04 03:55:23 +02:00 · 2023-08-04 03:55:23 +02:00 · 2922280a1a
commit 2922280a1a
parent 6691aa8797
1 changed files with 73 additions and 27 deletions
--- a/convert-gptneox-h5-to-gguf.py
+++ b/convert-gptneox-h5-to-gguf.py
@ -1,14 +1,36 @@
 # Quick and dirty HF gptneox--> gguf conversion

 import gguf
+import os
 import sys
 import struct
 import json
 import numpy as np
 from typing import Any, List
 from pathlib import Path
-from transformers import AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM

+# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8+n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))

 if len(sys.argv) < 3:
    print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
@ -20,7 +42,7 @@ if len(sys.argv) < 3:
 # output in the same directory as the model
 dir_model = sys.argv[1]
 fname_out = sys.argv[1] + "/ggml-model.bin"
-
+last_dir = os.path.basename(os.path.normpath(dir_model))

 # possible tensor data types
 #   ftype == 0 -> float32
@ -37,6 +59,8 @@ if len(sys.argv) > 2:
        sys.exit(1)
    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"

+print("gguf: loading model "+last_dir)
+
 with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

@ -44,17 +68,17 @@ if hparams["architectures"][0] != "GPTNeoXForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0] )
    sys.exit()

+
 model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True)
 list_vars = model.state_dict()

 gguf_writer = gguf.GGUFWriter.open(fname_out)

-
-print("gguf: add key-values, metadata")
+print("gguf: add metadata")

 llm_arch = "gptneox"

-gguf_writer.add_name("pythia-70b-deduped")
+gguf_writer.add_name(last_dir)
 gguf_writer.add_description("gguf test model")
 gguf_writer.add_architecture(llm_arch)
 gguf_writer.add_context_length(llm_arch, hparams["max_position_embeddings"])
@ -68,28 +92,55 @@ gguf_writer.add_layer_norm_eps(llm_arch, hparams["layer_norm_eps"])

 # TOKENIZATION

-print("gguf: add key-values, tokenizer")
+print("gguf: add tokenizer")

 tokens: List[str] = []
 merges: List[str] = []

+
 if Path(dir_model + "/tokenizer.json").is_file():
-    # vocab type gpt2
-    print("gguf: adding gpt2 tokenizer vocab")
+    # gpt2 tokenizer
+    gguf_writer.add_tokenizer_model("gpt2")
+
+    print("gguf: adding gpt2 tokenizer merges")

    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
-        tokenizer = json.load(f)
+        tokenizer_json = json.load(f)
+    merges = tokenizer_json["model"]["merges"]

-    for key in tokenizer["model"]["vocab"]:
-        tokens.append(key)
-
-    merges = tokenizer["model"]["merges"]
-
-    gguf_writer.add_tokenizer_model("gpt2")
-    gguf_writer.add_token_list(tokens)
    gguf_writer.add_token_merges(merges)

-    if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
+    print("gguf: adding gpt2 tokenizer vocab")
+
+    vocab_size = len( tokenizer_json["model"]["vocab"] )
+
+    # from ggllm.cpp falcon_convert.py
+    tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+    reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+    byte_encoder = bytes_to_unicode()
+    byte_decoder = {v:k for k, v in byte_encoder.items()}
+
+    for i in range(vocab_size):
+        if i in reverse_vocab:
+            try:
+                text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+            except KeyError:
+                text = bytearray()
+                for c in reverse_vocab[i]:
+                    if ord(c) < 256:  # single byte character
+                        text.append(byte_decoder[ord(c)])
+                    else:  # multibyte special token character
+                        text.extend(c.encode('utf-8'))
+        else:
+            print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+            padding_token = f"[PAD{i}]".encode("utf8")
+            text = bytearray(padding_token)
+        tokens.append(text)
+
+    gguf_writer.add_token_list(tokens)
+
+    if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
        print("gguf: adding special token ids")

        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
@ -98,27 +149,27 @@ if Path(dir_model + "/tokenizer.json").is_file():
        # find special token ids

        if "bos_token" in tokenizer_config:
-            for key in tokenizer["added_tokens"]:
+            for key in tokenizer_json["added_tokens"]:
                if key["content"] == tokenizer_config["bos_token"]:
                    gguf_writer.add_bos_token_id(key["id"])

        if "eos_token" in tokenizer_config:
-            for key in tokenizer["added_tokens"]:
+            for key in tokenizer_json["added_tokens"]:
                if key["content"] == tokenizer_config["eos_token"]:
                    gguf_writer.add_eos_token_id(key["id"])

        if "unk_token" in tokenizer_config:
-            for key in tokenizer["added_tokens"]:
+            for key in tokenizer_json["added_tokens"]:
                if key["content"] == tokenizer_config["unk_token"]:
                    gguf_writer.add_unk_token_id(key["id"])

        if "sep_token" in tokenizer_config:
-            for key in tokenizer["added_tokens"]:
+            for key in tokenizer_json["added_tokens"]:
                if key["content"] == tokenizer_config["sep_token"]:
                    gguf_writer.add_sep_token_id(key["id"])

        if "pad_token" in tokenizer_config:
-            for key in tokenizer["added_tokens"]:
+            for key in tokenizer_json["added_tokens"]:
                if key["content"] == tokenizer_config["pad_token"]:
                    gguf_writer.add_pad_token_id(key["id"])

@ -165,11 +216,9 @@ print("gguf: write tensor data")

 for name in list_vars.keys():
    data = list_vars[name].squeeze().numpy()
-#    print("Process tensor: " + name + " with shape: ", data.shape)

    # we don't need these
    if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
-#        print("  Skip tensor: " + name)
        continue

    n_dims = len(data.shape)
@ -178,16 +227,13 @@ for name in list_vars.keys():
    ftype_cur = 0
    if ftype != 0:
        if name.endswith(".weight") and n_dims == 2:
-#            print("  Converting to float16")
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
-#            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    else:
        if data.dtype != np.float32:
-#            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0