llama.cpp/convert-llama-7b-pth-to-gguf.py

# 7b pth llama --> gguf conversion, GQA/70b not supported
# Only models with a single datafile are supported, like 7B
# HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model

import gguf
import os
import sys
import struct
import json
import numpy as np
import torch

from typing import Any, List
from pathlib import Path
from sentencepiece import SentencePieceProcessor

#NDArray = np.ndarray[Any, Any]
# compatible with python < 3.9
NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'


def count_model_parts(dir_model: str) -> int:
    """Count the checkpoint part files stored in a model directory.

    Every directory entry whose name begins with ``consolidated.`` is
    counted as one part. When at least one part is found, the total is
    reported on stdout.

    Args:
        dir_model: Path of the directory to scan.

    Returns:
        The number of matching entries (0 when there are none).
    """
    part_total = sum(
        1 for entry in os.listdir(dir_model) if entry.startswith("consolidated.")
    )

    if part_total:
        print(f"gguf: found {part_total} model parts")
    return part_total


# Both positional arguments (model directory and ftype) are mandatory:
# print the usage text and abort with a failure status when one is missing.
if len(sys.argv) < 3:
    # The usage line names this script rather than the h5 converter.
    print("Usage: convert-llama-7b-pth-to-gguf.py dir-model ftype\n")
    print("  ftype == 0 -> float32")
    print("  ftype == 1 -> float16")

    sys.exit(1)


# output in the same directory as the model
dir_model = sys.argv[1]
# Last path component of the model directory (trailing separators are
# normalized away first); used further down as the model's display name.
last_dir = os.path.basename(os.path.normpath(dir_model))


# possible tensor data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

ftype = 1
if len(sys.argv) > 2:
    # A non-numeric ftype is reported the same way as an out-of-range one
    # instead of aborting with a ValueError traceback.
    try:
        ftype = int(sys.argv[2])
    except ValueError:
        print("Invalid ftype: " + sys.argv[2])

        sys.exit(1)

    if ftype < 0 or ftype > 1:
        print("Invalid ftype: " + str(ftype))

        sys.exit(1)

# The output file name encodes the chosen tensor type, e.g. ggml-model-f16.gguf.
fname_out = dir_model + "/ggml-model-" + ftype_str[ftype] + ".gguf"

print("gguf: loading model "+last_dir)

# Hyperparameters are read from the config.json found in the model directory.
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

# Only the first listed architecture is checked.
if hparams["architectures"][0] != "LlamaForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])
    # Exit with a failure status so callers can tell that nothing was converted.
    sys.exit(1)

# get number of model parts
num_parts = count_model_parts(dir_model)

if num_parts > 1:
    print("gguf: Only models with a single datafile are supported.")

    # Unsupported input is a failure, not a clean exit.
    sys.exit(1)

llm_arch = "llama"
gguf_writer = gguf.GGUFWriter(fname_out, arch=llm_arch)


print("gguf: get model metadata")

block_count = hparams["num_hidden_layers"]
head_count = hparams["num_attention_heads"]

# Without a dedicated key/value head count in config.json, reuse the
# attention head count.
head_count_kv = hparams.get("num_key_value_heads", head_count)

# "_name_or_path" from config.json, or an empty string when the key is absent.
hf_repo = hparams.get("_name_or_path", "")

# Record the model description in the GGUF writer; the call order is kept as is.
gguf_writer.add_architecture()
gguf_writer.add_name(last_dir)
gguf_writer.add_source_hf_repo(hf_repo)
gguf_writer.add_tensor_data_layout("Meta AI original pth")
gguf_writer.add_context_length(hparams["max_position_embeddings"])
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
# Rope dimension count is the hidden size divided by the number of attention heads.
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // head_count)
gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])


# TOKENIZATION

print("gguf: get tokenizer metadata")

# Parallel lists: entry k of each list describes token id k.
tokens: List[bytes] = []
scores: List[float] = []
toktypes: List[int] = []

sp_model_file = dir_model + "/tokenizer.model"

if Path(sp_model_file).is_file():
    # vocab type sentencepiece
    print("gguf: get sentencepiece tokenizer vocab and scores")

    tokenizer = SentencePieceProcessor(sp_model_file)

    for token_id in range(tokenizer.vocab_size()):
        token_text = tokenizer.id_to_piece(token_id).encode("utf-8")
        token_score = tokenizer.get_score(token_id)

        # Precedence when several flags apply to the same token:
        # byte > unused > control > unknown; anything else is a normal token.
        #
        # TODO: How to determine if a token is user defined?
        # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
        # if tokenizer.is_user_defined(i): toktype = 4
        if tokenizer.is_byte(token_id):
            toktype = 6
        elif tokenizer.is_unused(token_id):
            toktype = 5
        elif tokenizer.is_control(token_id):
            toktype = 3
        elif tokenizer.is_unknown(token_id):
            toktype = 2
        else:
            toktype = 1  # default to normal token type

        tokens.append(token_text)
        scores.append(token_score)
        toktypes.append(toktype)

    gguf_writer.add_tokenizer_model("llama")
    gguf_writer.add_token_list(tokens)
    gguf_writer.add_token_scores(scores)
    gguf_writer.add_token_types(toktypes)

# Special token ids are resolved by matching the token text configured in
# tokenizer_config.json against the "added_tokens" entries of tokenizer.json.
if Path(dir_model + "/tokenizer.json").is_file():
    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
        tokenizer = json.load(f)

    if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
        print("gguf: get special token ids")

        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
            tokenizer_config = json.load(f)

        # find special token ids

        # Each tokenizer_config.json key is paired with the writer method that
        # records the matching token id.
        special_token_setters = (
            ("bos_token", gguf_writer.add_bos_token_id),
            ("eos_token", gguf_writer.add_eos_token_id),
            ("unk_token", gguf_writer.add_unk_token_id),
            ("sep_token", gguf_writer.add_sep_token_id),
            ("pad_token", gguf_writer.add_pad_token_id),
        )

        for config_key, add_token_id in special_token_setters:
            # A missing key and an explicit null are both skipped.
            token_entry = tokenizer_config.get(config_key)
            if token_entry is None:
                continue

            # The entry may be a bare string or an object carrying the text
            # under "content"; accept both instead of failing on the string form.
            if isinstance(token_entry, str):
                token_content = token_entry
            else:
                token_content = token_entry["content"]

            for added_token in tokenizer["added_tokens"]:
                if added_token["content"] == token_content:
                    add_token_id(added_token["id"])


# TENSORS

tensor_map = gguf.get_tensor_name_map(block_count)

# tensor info
print("gguf: get tensor metadata")

# Part files are named consolidated.00.pth, consolidated.01.pth, ...
part_names = (f"consolidated.{n:02}.pth" for n in range(0, num_parts))

for part_name in part_names:
    print("gguf: loading model part '" + part_name + "'")
    # NOTE(review): torch.load unpickles the file, which can execute arbitrary
    # code; only convert checkpoints that come from a trusted source.
    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")

    for name, data in model_part.items():
        # we don't need these
        if name == "rope.freqs":
            continue

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        # Drop size-1 dimensions and continue with a numpy array.
        data = data.squeeze().numpy()

        # map tensor names (the lookup key is the name without its suffix)
        if name.endswith(".weight") and name[:-7] in tensor_map:
            name = tensor_map[name[:-7]] + ".weight"
        elif name.endswith(".bias") and name[:-5] in tensor_map:
            name = tensor_map[name[:-5]] + ".bias"
        else:
            print("Can not map tensor '" + name + "'")
            # An unmappable tensor aborts the conversion; report it as a failure.
            sys.exit(1)

        n_dims = len(data.shape)
        data_dtype = data.dtype
        old_dtype = data_dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data.dtype == np.float16:
            data_dtype = np.float32

        # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data_dtype = np.float32

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data.dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data_dtype = np.float16

        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data_dtype))

        data = data.astype(data_dtype)

        gguf_writer.add_tensor(name, data)


# Write the output in three stages, announcing each one before it runs.
for stage_label, write_stage in (
    ("header", gguf_writer.write_header_to_file),
    ("metadata", gguf_writer.write_kv_data_to_file),
    ("tensors", gguf_writer.write_tensors_to_file),
):
    print("gguf: write " + stage_label)
    write_stage()

gguf_writer.close()


print(f"gguf: model successfully exported to '{fname_out}'")
print()
Create convert-llama-7b-pth-to-gguf.py 2023-08-14 11:51:09 +00:00			`# 7b pth llama --> gguf conversion, GQA/70b not supported`
			`# Only models with a single datafile are supported, like 7B`
			`# HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model`

			`import gguf`
			`import os`
			`import sys`
			`import struct`
			`import json`
			`import numpy as np`
			`import torch`
gguf.py : merge all files in gguf.py 2023-08-16 16:55:49 +00:00
Create convert-llama-7b-pth-to-gguf.py 2023-08-14 11:51:09 +00:00			`from typing import Any, List`
			`from pathlib import Path`
			`from sentencepiece import SentencePieceProcessor`

			`#NDArray = np.ndarray[Any, Any]`
			`# compatible with python < 3.9`
			`NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'`

gguf : refactor pth to gguf conversion script 2023-08-17 16:58:27 +00:00
Create convert-llama-7b-pth-to-gguf.py 2023-08-14 11:51:09 +00:00			`def count_model_parts(dir_model: str) -> int:`
			`num_parts = 0`
			`for filename in os.listdir(dir_model):`
			`if filename.startswith("consolidated."):`
			`num_parts += 1`

			`if num_parts > 0:`
			`print("gguf: found " + str(num_parts) + " model parts")`
			`return num_parts`

gguf : refactor pth to gguf conversion script 2023-08-17 16:58:27 +00:00
Create convert-llama-7b-pth-to-gguf.py 2023-08-14 11:51:09 +00:00			`if len(sys.argv) < 3:`
			`print("Usage: convert-h5-to-ggml.py dir-model ftype\n")`
			`print(" ftype == 0 -> float32")`
			`print(" ftype == 1 -> float16")`
gguf : refactor pth to gguf conversion script 2023-08-17 16:58:27 +00:00
Create convert-llama-7b-pth-to-gguf.py 2023-08-14 11:51:09 +00:00			`sys.exit(1)`


			`# output in the same directory as the model`
			`dir_model = sys.argv[1]`
			`last_dir = os.path.basename(os.path.normpath(dir_model))`


			`# possible tensor data types`
			`# ftype == 0 -> float32`
			`# ftype == 1 -> float16`
gguf : refactor pth to gguf conversion script 2023-08-17 16:58:27 +00:00
Create convert-llama-7b-pth-to-gguf.py 2023-08-14 11:51:09 +00:00			`# map from ftype to string`
			`ftype_str = ["f32", "f16"]`

			`ftype = 1`
			`if len(sys.argv) > 2:`
			`ftype = int(sys.argv[2])`
			`if ftype < 0 or ftype > 1:`
			`print("Invalid ftype: " + str(ftype))`
gguf : refactor pth to gguf conversion script 2023-08-17 16:58:27 +00:00
Create convert-llama-7b-pth-to-gguf.py 2023-08-14 11:51:09 +00:00			`sys.exit(1)`

			`fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"`

			`print("gguf: loading model "+last_dir)`

			`with open(dir_model + "/config.json", "r", encoding="utf-8") as f:`
			`hparams = json.load(f)`

			`if hparams["architectures"][0] != "LlamaForCausalLM":`
			`print("Model architecture not supported: " + hparams["architectures"][0])`
			`sys.exit()`

			`# get number of model parts`
			`num_parts = count_model_parts(dir_model)`

			`if num_parts > 1:`
			`print("gguf: Only models with a single datafile are supported.")`

gguf : refactor pth to gguf conversion script 2023-08-17 16:58:27 +00:00			`sys.exit()`
			`llm_arch = "llama"`
			`gguf_writer = gguf.GGUFWriter(fname_out, arch=llm_arch)`
Create convert-llama-7b-pth-to-gguf.py 2023-08-14 11:51:09 +00:00

			`print("gguf: get model metadata")`

			`block_count = hparams["num_hidden_layers"]`
			`head_count = hparams["num_attention_heads"]`

			`if "num_key_value_heads" in hparams:`
			`head_count_kv = hparams["num_key_value_heads"]`
			`else:`
			`head_count_kv = head_count`

			`if "_name_or_path" in hparams:`
			`hf_repo = hparams["_name_or_path"]`
			`else:`
gguf : refactor pth to gguf conversion script 2023-08-17 16:58:27 +00:00			`hf_repo = ""`
Create convert-llama-7b-pth-to-gguf.py 2023-08-14 11:51:09 +00:00
gguf : refactor pth to gguf conversion script 2023-08-17 16:58:27 +00:00			`gguf_writer.add_architecture()`
Create convert-llama-7b-pth-to-gguf.py 2023-08-14 11:51:09 +00:00			`gguf_writer.add_name(last_dir)`
			`gguf_writer.add_source_hf_repo(hf_repo)`
gguf : refactor pth to gguf conversion script 2023-08-17 16:58:27 +00:00			`gguf_writer.add_tensor_data_layout("Meta AI original pth")`
			`gguf_writer.add_context_length(hparams["max_position_embeddings"])`
			`gguf_writer.add_embedding_length(hparams["hidden_size"])`
			`gguf_writer.add_block_count(block_count)`
			`gguf_writer.add_feed_forward_length(hparams["intermediate_size"])`
			`gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])`
			`gguf_writer.add_head_count(head_count)`
			`gguf_writer.add_head_count_kv(head_count_kv)`
			`gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])`
Create convert-llama-7b-pth-to-gguf.py 2023-08-14 11:51:09 +00:00

			`# TOKENIZATION`

			`print("gguf: get tokenizer metadata")`

convert-llama-7b-pth-to-gguf.py : add token types 2023-08-14 20:10:50 +00:00			`tokens: List[bytes] = []`
Create convert-llama-7b-pth-to-gguf.py 2023-08-14 11:51:09 +00:00			`scores: List[float] = []`
convert-llama-7b-pth-to-gguf.py : add token types 2023-08-14 20:10:50 +00:00			`toktypes: List[int] = []`
Create convert-llama-7b-pth-to-gguf.py 2023-08-14 11:51:09 +00:00
			`if Path(dir_model + "/tokenizer.model").is_file():`
			`# vocab type sentencepiece`
			`print("gguf: get sentencepiece tokenizer vocab and scores")`

			`tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")`

			`for i in range(tokenizer.vocab_size()):`
			`text: bytes`
convert-llama-7b-pth-to-gguf.py : add token types 2023-08-14 20:10:50 +00:00			`score: float`

			`piece = tokenizer.id_to_piece(i)`
gguf : refactor pth to gguf conversion script 2023-08-17 16:58:27 +00:00			`text = piece.encode("utf-8")`
convert-llama-7b-pth-to-gguf.py : add token types 2023-08-14 20:10:50 +00:00			`score = tokenizer.get_score(i)`

gguf : refactor pth to gguf conversion script 2023-08-17 16:58:27 +00:00			`toktype = 1 # defualt to normal token type`
			`if tokenizer.is_unknown(i):`
			`toktype = 2`
			`if tokenizer.is_control(i):`
			`toktype = 3`
gguf : deduplicate (#2629) * gguf : better type names * dedup : CPU + Metal is working * ggml : fix warnings about unused results * llama.cpp : fix line feed and compiler warning * llama : fix strncpy warning + note token_to_str does not write null * llama : restore the original load/save session implementation Will migrate this to GGUF in the future * convert-llama-h5-to-gguf.py : support alt ctx param name * ggml : assert when using ggml_mul with non-F32 src1 * examples : dedup simple --------- Co-authored-by: klosax <131523366+klosax@users.noreply.github.com> 2023-08-16 16:25:29 +00:00
convert-llama-7b-pth-to-gguf.py : add token types 2023-08-14 20:10:50 +00:00			`# TODO: How to determinate if a token is user defined?`
			`# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto`
			`# if tokenizer.is_user_defined(i): toktype = 4`

gguf : refactor pth to gguf conversion script 2023-08-17 16:58:27 +00:00			`if tokenizer.is_unused(i):`
			`toktype = 5`
			`if tokenizer.is_byte(i):`
			`toktype = 6`
Create convert-llama-7b-pth-to-gguf.py 2023-08-14 11:51:09 +00:00
			`tokens.append(text)`
			`scores.append(score)`
convert-llama-7b-pth-to-gguf.py : add token types 2023-08-14 20:10:50 +00:00			`toktypes.append(toktype)`
Create convert-llama-7b-pth-to-gguf.py 2023-08-14 11:51:09 +00:00
			`gguf_writer.add_tokenizer_model("llama")`
			`gguf_writer.add_token_list(tokens)`
			`gguf_writer.add_token_scores(scores)`
convert-llama-7b-pth-to-gguf.py : add token types 2023-08-14 20:10:50 +00:00			`gguf_writer.add_token_types(toktypes)`
Create convert-llama-7b-pth-to-gguf.py 2023-08-14 11:51:09 +00:00
			`if Path(dir_model + "/tokenizer.json").is_file():`
			`with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:`
			`tokenizer = json.load(f)`

			`if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():`
			`print("gguf: get special token ids")`

			`with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:`
			`tokenizer_config = json.load(f)`

			`# find special token ids`

			`if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:`
			`for key in tokenizer["added_tokens"]:`
			`if key["content"] == tokenizer_config["bos_token"]["content"]:`
			`gguf_writer.add_bos_token_id(key["id"])`

			`if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:`
			`for key in tokenizer["added_tokens"]:`
			`if key["content"] == tokenizer_config["eos_token"]["content"]:`
			`gguf_writer.add_eos_token_id(key["id"])`

			`if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:`
			`for key in tokenizer["added_tokens"]:`
			`if key["content"] == tokenizer_config["unk_token"]["content"]:`
			`gguf_writer.add_unk_token_id(key["id"])`

			`if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:`
			`for key in tokenizer["added_tokens"]:`
			`if key["content"] == tokenizer_config["sep_token"]["content"]:`
			`gguf_writer.add_sep_token_id(key["id"])`

			`if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:`
			`for key in tokenizer["added_tokens"]:`
			`if key["content"] == tokenizer_config["pad_token"]["content"]:`
			`gguf_writer.add_pad_token_id(key["id"])`


			`# TENSORS`

gguf.py : merge all files in gguf.py 2023-08-16 16:55:49 +00:00			`tensor_map = gguf.get_tensor_name_map(block_count)`
Create convert-llama-7b-pth-to-gguf.py 2023-08-14 11:51:09 +00:00
			`# tensor info`
			`print("gguf: get tensor metadata")`

gguf : refactor pth to gguf conversion script 2023-08-17 16:58:27 +00:00			`part_names = (f"consolidated.{n:02}.pth" for n in range(0, num_parts))`
Create convert-llama-7b-pth-to-gguf.py 2023-08-14 11:51:09 +00:00
			`for part_name in part_names:`
gguf : refactor pth to gguf conversion script 2023-08-17 16:58:27 +00:00			`print("gguf: loading model part '" + part_name + "'")`
Create convert-llama-7b-pth-to-gguf.py 2023-08-14 11:51:09 +00:00			`model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")`

			`for name in model_part.keys():`
			`data = model_part[name]`

			`# we don't need these`
			`if name == "rope.freqs":`
			`continue`

			`# convert any unsupported data types to float32`
			`if data.dtype != torch.float16 and data.dtype != torch.float32:`
			`data = data.to(torch.float32)`

			`data = data.squeeze().numpy()`

			`# map tensor names`
			`if name.endswith(".weight") and name[:-7] in tensor_map:`
			`name = tensor_map[name[:-7]] + ".weight"`
			`elif name.endswith(".bias") and name[:-5] in tensor_map:`
			`name = tensor_map[name[:-5]] + ".bias"`
			`else:`
gguf : refactor pth to gguf conversion script 2023-08-17 16:58:27 +00:00			`print("Can not map tensor '" + name + "'")`
Create convert-llama-7b-pth-to-gguf.py 2023-08-14 11:51:09 +00:00			`sys.exit()`

			`n_dims = len(data.shape)`
gguf : deduplicate (#2629) * gguf : better type names * dedup : CPU + Metal is working * ggml : fix warnings about unused results * llama.cpp : fix line feed and compiler warning * llama : fix strncpy warning + note token_to_str does not write null * llama : restore the original load/save session implementation Will migrate this to GGUF in the future * convert-llama-h5-to-gguf.py : support alt ctx param name * ggml : assert when using ggml_mul with non-F32 src1 * examples : dedup simple --------- Co-authored-by: klosax <131523366+klosax@users.noreply.github.com> 2023-08-16 16:25:29 +00:00			`data_dtype = data.dtype`
gguf : refactor pth to gguf conversion script 2023-08-17 16:58:27 +00:00			`old_dtype = data_dtype`
Create convert-llama-7b-pth-to-gguf.py 2023-08-14 11:51:09 +00:00
			`# if f32 desired, convert any float16 to float32`
			`if ftype == 0 and data.dtype == np.float16:`
			`data_dtype = np.float32`

			`# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32`
			`if ftype == 1 and data_dtype == np.float16 and n_dims == 1:`
			`data_dtype = np.float32`

			`# if f16 desired, convert any float32 2-dim weight tensors to float16`
			`if ftype == 1 and data.dtype == np.float32 and name.endswith(".weight") and n_dims == 2:`
			`data_dtype = np.float16`

gguf : refactor pth to gguf conversion script 2023-08-17 16:58:27 +00:00			`print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data_dtype))`

			`data = data.astype(data_dtype)`
Create convert-llama-7b-pth-to-gguf.py 2023-08-14 11:51:09 +00:00
gguf : refactor pth to gguf conversion script 2023-08-17 16:58:27 +00:00			`gguf_writer.add_tensor(name, data)`
Create convert-llama-7b-pth-to-gguf.py 2023-08-14 11:51:09 +00:00

			`print("gguf: write header")`
			`gguf_writer.write_header_to_file()`
			`print("gguf: write metadata")`
			`gguf_writer.write_kv_data_to_file()`
gguf : refactor pth to gguf conversion script 2023-08-17 16:58:27 +00:00			`print("gguf: write tensors")`
			`gguf_writer.write_tensors_to_file()`
Create convert-llama-7b-pth-to-gguf.py 2023-08-14 11:51:09 +00:00
			`gguf_writer.close()`


			`print("gguf: model successfully exported to '" + fname_out + "'")`
			`print("")`