# 7b pth llama --> gguf conversion, GQA/70b not supported
# Only models with a single datafile are supported, like 7B
# HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model

import gguf
import os
import sys
import struct
import json
import numpy as np
import torch

from typing import Any, List
from pathlib import Path
from sentencepiece import SentencePieceProcessor

# NDArray = np.ndarray[Any, Any]
# compatible with python < 3.9
NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'


def count_model_parts(dir_model: str) -> int:
    """Count the entries in *dir_model* whose name starts with "consolidated.".

    Prints the count when at least one part is found and returns it.
    """
    num_parts = sum(
        1
        for filename in os.listdir(dir_model)
        if filename.startswith("consolidated.")
    )

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts


def _special_token_content(entry: Any) -> str:
    """Return the text of a special-token entry read from tokenizer_config.json.

    Accepts either a mapping with a "content" key or a bare string, so a
    string-valued entry no longer fails with a TypeError on indexing.
    """
    if isinstance(entry, dict):
        return entry["content"]
    return entry


# COMMAND LINE

if len(sys.argv) < 3:
    # Report the name this script was actually invoked under.
    print("Usage: " + os.path.basename(sys.argv[0]) + " dir-model ftype\n")
    print(" ftype == 0 -> float32")
    print(" ftype == 1 -> float16")
    sys.exit(1)

# output in the same directory as the model
dir_model = sys.argv[1]
last_dir = os.path.basename(os.path.normpath(dir_model))

# possible tensor data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

try:
    ftype = int(sys.argv[2])
except ValueError:
    # A non-numeric ftype gets the same diagnostic as an out-of-range one
    # instead of an uncaught traceback.
    print("Invalid ftype: " + sys.argv[2])
    sys.exit(1)

if ftype < 0 or ftype > 1:
    print("Invalid ftype: " + str(ftype))
    sys.exit(1)

fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"

print("gguf: loading model "+last_dir)

with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

if hparams["architectures"][0] != "LlamaForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])
    # Non-zero status so calling scripts can detect the failure.
    sys.exit(1)

# get number of model parts
num_parts = count_model_parts(dir_model)

if num_parts == 0:
    # Without this check the tensor loop below is skipped and a GGUF file
    # without any weights is reported as a successful export.
    print("gguf: no 'consolidated.*' model part found in " + dir_model)
    sys.exit(1)

if num_parts > 1:
    print("gguf: Only models with a single datafile are supported.")
    sys.exit(1)

# The writer is created only after all input checks have passed.
llm_arch = "llama"
gguf_writer = gguf.GGUFWriter(fname_out, arch=llm_arch)

print("gguf: get model metadata")

block_count = hparams["num_hidden_layers"]
head_count = hparams["num_attention_heads"]

# fall back to the attention head count when the config has no separate KV head count
head_count_kv = hparams.get("num_key_value_heads", head_count)
hf_repo = hparams.get("_name_or_path", "")

gguf_writer.add_architecture()
gguf_writer.add_name(last_dir)
gguf_writer.add_source_hf_repo(hf_repo)
gguf_writer.add_tensor_data_layout("Meta AI original pth")
gguf_writer.add_context_length(hparams["max_position_embeddings"])
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])

# TOKENIZATION

print("gguf: get tokenizer metadata")

tokens: List[bytes] = []
scores: List[float] = []
toktypes: List[int] = []

if Path(dir_model + "/tokenizer.model").is_file():
    # vocab type sentencepiece
    print("gguf: get sentencepiece tokenizer vocab and scores")

    tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")

    for i in range(tokenizer.vocab_size()):
        text: bytes
        score: float

        piece = tokenizer.id_to_piece(i)
        text = piece.encode("utf-8")
        score = tokenizer.get_score(i)

        # The checks below are deliberately independent `if`s: when several
        # apply, the last matching one decides the token type.
        toktype = 1  # default to normal token type
        if tokenizer.is_unknown(i):
            toktype = 2
        if tokenizer.is_control(i):
            toktype = 3

        # TODO: How to determine if a token is user defined?
        # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
        # if tokenizer.is_user_defined(i): toktype = 4

        if tokenizer.is_unused(i):
            toktype = 5
        if tokenizer.is_byte(i):
            toktype = 6

        tokens.append(text)
        scores.append(score)
        toktypes.append(toktype)

    gguf_writer.add_tokenizer_model("llama")
    gguf_writer.add_token_list(tokens)
    gguf_writer.add_token_scores(scores)
    gguf_writer.add_token_types(toktypes)

if Path(dir_model + "/tokenizer.json").is_file():
    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
        tokenizer = json.load(f)

    if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
        print("gguf: get special token ids")

        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
            tokenizer_config = json.load(f)

        # find special token ids
        # (tokenizer_config key, name of the writer method that records the id);
        # the method is looked up only when a matching token is found.
        special_tokens = (
            ("bos_token", "add_bos_token_id"),
            ("eos_token", "add_eos_token_id"),
            ("unk_token", "add_unk_token_id"),
            ("sep_token", "add_sep_token_id"),
            ("pad_token", "add_pad_token_id"),
        )

        for config_key, writer_method in special_tokens:
            token_entry = tokenizer_config.get(config_key)
            if token_entry is None:
                continue

            token_content = _special_token_content(token_entry)
            for added_token in tokenizer["added_tokens"]:
                if added_token["content"] == token_content:
                    getattr(gguf_writer, writer_method)(added_token["id"])

# TENSORS

tensor_map = gguf.get_tensor_name_map(block_count)

# tensor info
print("gguf: get tensor metadata")

part_names = (f"consolidated.{n:02}.pth" for n in range(0, num_parts))

for part_name in part_names:
    print("gguf: loading model part '" + part_name + "'")
    # NOTE(security): torch.load unpickles the checkpoint, which can execute
    # arbitrary code. Only convert model files obtained from a trusted source.
    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")

    for name, data in model_part.items():
        # we don't need these
        if name == "rope.freqs":
            continue

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        data = data.squeeze().numpy()

        # map tensor names
        if name.endswith(".weight") and name[:-7] in tensor_map:
            name = tensor_map[name[:-7]] + ".weight"
        elif name.endswith(".bias") and name[:-5] in tensor_map:
            name = tensor_map[name[:-5]] + ".bias"
        else:
            print("Can not map tensor '" + name + "'")
            # Non-zero status: the conversion did not complete.
            sys.exit(1)

        n_dims = len(data.shape)
        data_dtype = data.dtype

        old_dtype = data_dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data.dtype == np.float16:
            data_dtype = np.float32

        # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data_dtype = np.float32

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data.dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data_dtype = np.float16

        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data_dtype))

        data = data.astype(data_dtype)

        gguf_writer.add_tensor(name, data)


print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()

gguf_writer.close()


print("gguf: model successfully exported to '" + fname_out + "'")
print("")