diff --git a/convert-llama-h5-to-gguf.py b/convert-llama-h5-to-gguf.py
new file mode 100644
index 000000000..29daed3b3
--- /dev/null
+++ b/convert-llama-h5-to-gguf.py
@@ -0,0 +1,231 @@
+# Quick and dirty HF llama --> gguf conversion, GQA/70B won't work
+
+import gguf
+import sys
+import struct
+import json
+import numpy as np
+from typing import List
+from pathlib import Path
+from transformers import AutoModelForCausalLM
+from sentencepiece import SentencePieceProcessor
+
+
+NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
+
+
+def permute(weights: NDArray, n_head: int) -> NDArray:
+    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape))
+
+
+if len(sys.argv) < 3:
+    print("Usage: convert-llama-h5-to-gguf.py dir-model ftype\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+    sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+#
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = int(sys.argv[2])
+if ftype < 0 or ftype > 1:
+    print("Invalid ftype: " + str(ftype))
+    sys.exit(1)
+
+fname_out = dir_model + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+
+model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True)
+list_vars = model.state_dict()
+
+# count tensors to be converted
+tensor_count = 0
+for name in list_vars.keys():
+    # we don't need these
+    if name.endswith(".rotary_emb.inv_freq"):
+        continue
+    tensor_count += 1
+
+gguf_writer = gguf.GGUFWriter.open(fname_out)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+# This must be changed when adding/deleting kv
+kv_count = 13
+
+print("tensors " + str(tensor_count) + " kv " + str(kv_count))
+
+print("write gguf header")
+
+gguf_writer.write_header(tensor_count, kv_count)
+
+print("write gguf hparams")
+
+llm_arch = "llama"
+
+gguf_writer.write_name("llama2-7b")
+gguf_writer.write_description("gguf test model")
+gguf_writer.write_architecture(llm_arch)
+gguf_writer.write_context_length(llm_arch, hparams["max_position_embeddings"])
+gguf_writer.write_embedding_length(llm_arch, hparams["hidden_size"])
+gguf_writer.write_layer_count(llm_arch, hparams["num_hidden_layers"])
+gguf_writer.write_feed_forward_length(llm_arch, hparams["intermediate_size"])
+gguf_writer.write_rope_dimension_count(llm_arch, hparams["hidden_size"] // hparams["num_attention_heads"])
+gguf_writer.write_head_count(llm_arch, hparams["num_attention_heads"])
+gguf_writer.write_float32(llm_arch + ".attention.layer_norm_rms_epsilon", hparams["rms_norm_eps"])
+
+
+# TOKENIZATION
+
+tokens: List[str] = []
+scores: List[float] = []
+
+if Path(dir_model + "/tokenizer.model").is_file():
+    # vocab type SPIECE
+    print("Adding sentencepiece tokenizer vocab.")
+    tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")
+
+    for i in range(tokenizer.vocab_size()):
+        text: bytes
+        if tokenizer.is_unknown(i):
+            text = " \u2047 ".encode("utf-8")
+        elif tokenizer.is_control(i):
+            text = b""
+        elif tokenizer.is_byte(i):
+            piece = tokenizer.id_to_piece(i)
+            if len(piece) != 6:
+                raise Exception(f"Invalid token: {piece}")
+            byte_value = int(piece[3:-1], 16)
+            text = struct.pack("B", byte_value)
+        else:
+            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+        score: float = tokenizer.get_score(i)
+
+        tokens.append(str(text))
+        scores.append(score)
+
+print("write gguf tokens")
+
+gguf_writer.write_string("tokenizer.ggml.model", "llama")
+gguf_writer.write_array("tokenizer.ggml.tokens", tokens)
+gguf_writer.write_array("tokenizer.ggml.scores", scores)
+
+
+# TENSORS
+
+# tensor info
+print("write gguf tensor info")
+
+for name in list_vars.keys():
+    data = list_vars[name].squeeze().numpy()
+
+    # we don't need these
+    if name.endswith(".rotary_emb.inv_freq"):
+        continue
+
+    # permute these
+    if name.endswith(".q_proj.weight") or name.endswith(".k_proj.weight"):
+        data = permute(data, hparams["num_attention_heads"])
+
+    # change tensor name
+    if name == "model.embed_tokens.weight":
+        name = "tok_embeddings.weight"
+    elif name == "model.norm.weight":
+        name = "norm.weight"
+    elif name == "lm_head.weight":
+        name = "output.weight"
+    else:
+        for i in range(80):  # maximum number of layers
+            if name == "model.layers." + str(i) + ".input_layernorm.weight":
+                name = "layers." + str(i) + ".attention_norm.weight"
+                break
+            if name == "model.layers." + str(i) + ".self_attn.q_proj.weight":
+                name = "layers." + str(i) + ".attention.wq.weight"
+                break
+            if name == "model.layers." + str(i) + ".self_attn.k_proj.weight":
+                name = "layers." + str(i) + ".attention.wk.weight"
+                break
+            if name == "model.layers." + str(i) + ".self_attn.v_proj.weight":
+                name = "layers." + str(i) + ".attention.wv.weight"
+                break
+            if name == "model.layers." + str(i) + ".self_attn.o_proj.weight":
+                name = "layers." + str(i) + ".attention.wo.weight"
+                break
+            if name == "model.layers." + str(i) + ".post_attention_layernorm.weight":
+                name = "layers." + str(i) + ".ffn_norm.weight"
+                break
+            if name == "model.layers." + str(i) + ".mlp.gate_proj.weight":
+                name = "layers." + str(i) + ".feed_forward.w1.weight"
+                break
+            if name == "model.layers." + str(i) + ".mlp.down_proj.weight":
+                name = "layers." + str(i) + ".feed_forward.w2.weight"
+                break
+            if name == "model.layers." + str(i) + ".mlp.up_proj.weight":
+                name = "layers." + str(i) + ".feed_forward.w3.weight"
+                break
+
+    gguf_writer.write_tensor_info(name, data)
+
+
+# tensor data
+print("write gguf tensor data")
+
+for name in list_vars.keys():
+    data = list_vars[name].squeeze().numpy()
+    print("Process tensor: " + name + " with shape: ", data.shape)
+
+    # we don't need these
+    if name.endswith(".rotary_emb.inv_freq"):
+        print("  Skip tensor: " + name)
+        continue
+
+    # permute these
+    if name.endswith(".q_proj.weight") or name.endswith(".k_proj.weight"):
+        print("  Permute tensor: " + name)
+        data = permute(data, hparams["num_attention_heads"])
+
+    n_dims = len(data.shape)
+
+    # ftype == 0 -> float32, ftype == 1 -> float16
+    ftype_cur = 0
+    if ftype != 0:
+        if name.endswith(".weight") and n_dims == 2:
+            print("  Converting to float16")
+            data = data.astype(np.float16)
+            ftype_cur = 1
+        else:
+            print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+    else:
+        if data.dtype != np.float32:
+            print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+
+    gguf_writer.write_tensor_padding()
+    gguf_writer.write_tensor(data)
+
+gguf_writer.close()
+
+
+print("Done. Output file: " + fname_out)
+print("")