# llama.cpp/convert-llama-h5-to-gguf.py

# Quick and dirty HF llama --> gguf conversion, GQA/70b won't work
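#
# Example invocation (the model directory path is illustrative):
#   python convert-llama-h5-to-gguf.py models/llama-2-7b-hf 1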
import gguf
import sys
import struct
import json
import numpy as np
from typing import Any, List
from pathlib import Path
from transformers import AutoModelForCausalLM
from sentencepiece import SentencePieceProcessor
#NDArray = np.ndarray[Any, Any]
# compatible with python < 3.9
NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
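# HF checkpoints store q_proj/k_proj permuted for HF's rotate-half RoPE;
# permute() undoes that permutation so the rows are back in the original
# llama layout that ggml's RoPE expects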
def permute(weights: NDArray, n_head: int) -> NDArray:
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
            .swapaxes(1, 2)
            .reshape(weights.shape))
if len(sys.argv) < 3:
    print("Usage: convert-llama-h5-to-gguf.py dir-model ftype\n")
    print("  ftype == 0 -> float32")
    print("  ftype == 1 -> float16")
    sys.exit(1)
# output in the same directory as the model
dir_model = sys.argv[1]

# possible tensor data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]

ftype = int(sys.argv[2])
if ftype < 0 or ftype > 1:
    print("Invalid ftype: " + str(ftype))
    sys.exit(1)

fname_out = dir_model + "/ggml-model-" + ftype_str[ftype] + ".gguf"
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

if hparams["architectures"][0] != "LlamaForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])
    sys.exit(1)
model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True)
list_vars = model.state_dict()
gguf_writer = gguf.GGUFWriter.open(fname_out)
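# the file is written in two passes: header, key-values and tensor info
# first, then the raw tensor data appended in the same order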
print("gguf: add key-values, metadata")
llm_arch = "llama"
gguf_writer.add_name("llama2-7b")
gguf_writer.add_description("gguf test model")
gguf_writer.add_architecture(llm_arch)
gguf_writer.add_context_length(llm_arch, hparams["max_position_embeddings"])
gguf_writer.add_embedding_length(llm_arch, hparams["hidden_size"])
gguf_writer.add_layer_count(llm_arch, hparams["num_hidden_layers"])
gguf_writer.add_feed_forward_length(llm_arch, hparams["intermediate_size"])
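# rope is applied per attention head, so the rotary dimension count is the
# per-head size: hidden_size / num_attention_heads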
gguf_writer.add_rope_dimension_count(llm_arch, hparams["hidden_size"] // hparams["num_attention_heads"])
gguf_writer.add_head_count(llm_arch, hparams["num_attention_heads"])
gguf_writer.add_layer_norm_rms_eps(llm_arch, hparams["rms_norm_eps"])
# TOKENIZATION
print("gguf: add key-values, tokenizer")
tokens: List[bytes] = []
scores: List[float] = []
if Path(dir_model + "/tokenizer.model").is_file():
    # vocab type sentencepiece
    print("gguf: adding sentencepiece tokenizer vocab")

    tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")

    for i in range(tokenizer.vocab_size()):
        text: bytes
        if tokenizer.is_unknown(i):
            text = " \u2047 ".encode("utf-8")
        elif tokenizer.is_control(i):
            text = b""
        elif tokenizer.is_byte(i):
            piece = tokenizer.id_to_piece(i)
            if len(piece) != 6:
                raise Exception(f"Invalid token: {piece}")
            byte_value = int(piece[3:-1], 16)
            text = struct.pack("B", byte_value)
        else:
            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
        score: float = tokenizer.get_score(i)

        tokens.append(text)
        scores.append(score)

    gguf_writer.add_tokenizer_model("llama")
    gguf_writer.add_token_list(tokens)
    gguf_writer.add_token_scores(scores)
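# special tokens are declared in tokenizer_config.json; match each one
# against the "added_tokens" entries of tokenizer.json to recover its id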
if Path(dir_model + "/tokenizer.json").is_file():
    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
        tokenizer = json.load(f)

    if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
        print("gguf: adding special token ids")

        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
            tokenizer_config = json.load(f)
        # find special token ids
        special_tokens = {
            "bos_token": gguf_writer.add_bos_token_id,
            "eos_token": gguf_writer.add_eos_token_id,
            "unk_token": gguf_writer.add_unk_token_id,
            "sep_token": gguf_writer.add_sep_token_id,
            "pad_token": gguf_writer.add_pad_token_id,
        }
        for token_name, add_token_id in special_tokens.items():
            if tokenizer_config.get(token_name) is not None:
                for key in tokenizer["added_tokens"]:
                    if key["content"] == tokenizer_config[token_name]["content"]:
                        add_token_id(key["id"])
# TENSORS
# tensor info
print("gguf: add gguf tensor info")
for name in list_vars.keys():
    data = list_vars[name].squeeze().numpy()

    # we don't need these
    if name.endswith(".rotary_emb.inv_freq"):
        continue

    # permute these
    if name.endswith(".q_proj.weight") or name.endswith(".k_proj.weight"):
        data = permute(data, hparams["num_attention_heads"])

    # change tensor name
    if name == "model.embed_tokens.weight":
        name = "tok_embeddings.weight"
    elif name == "model.norm.weight":
        name = "norm.weight"
    elif name == "lm_head.weight":
        name = "output.weight"
    else:
        for i in range(80):  # maximum number of layers
            if name == "model.layers." + str(i) + ".input_layernorm.weight":
                name = "layers." + str(i) + ".attention_norm.weight"
                break
            if name == "model.layers." + str(i) + ".self_attn.q_proj.weight":
                name = "layers." + str(i) + ".attention.wq.weight"
                break
            if name == "model.layers." + str(i) + ".self_attn.k_proj.weight":
                name = "layers." + str(i) + ".attention.wk.weight"
                break
            if name == "model.layers." + str(i) + ".self_attn.v_proj.weight":
                name = "layers." + str(i) + ".attention.wv.weight"
                break
            if name == "model.layers." + str(i) + ".self_attn.o_proj.weight":
                name = "layers." + str(i) + ".attention.wo.weight"
                break
            if name == "model.layers." + str(i) + ".post_attention_layernorm.weight":
                name = "layers." + str(i) + ".ffn_norm.weight"
                break
            if name == "model.layers." + str(i) + ".mlp.gate_proj.weight":
                name = "layers." + str(i) + ".feed_forward.w1.weight"
                break
            if name == "model.layers." + str(i) + ".mlp.down_proj.weight":
                name = "layers." + str(i) + ".feed_forward.w2.weight"
                break
            if name == "model.layers." + str(i) + ".mlp.up_proj.weight":
                name = "layers." + str(i) + ".feed_forward.w3.weight"
                break
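    # in f16 mode only 2D ".weight" matrices are converted; 1D tensors such
    # as norm weights stay f32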
    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype_cur = 0
    if ftype != 0:
        if name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            data = data.astype(np.float32)
            ftype_cur = 0
    else:
        if data.dtype != np.float32:
            data = data.astype(np.float32)
            ftype_cur = 0

    gguf_writer.add_tensor_info(name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write key-values")
gguf_writer.write_kv_data_to_file()
print("gguf: write tensor info")
gguf_writer.write_ti_data_to_file()
# tensor data
print("gguf: write tensor data")
for name in list_vars.keys():
    data = list_vars[name].squeeze().numpy()
    # print("Process tensor: " + name + " with shape: ", data.shape)

    # we don't need these
    if name.endswith(".rotary_emb.inv_freq"):
        # print("  Skip tensor: " + name)
        continue

    # permute these
    if name.endswith(".q_proj.weight") or name.endswith(".k_proj.weight"):
        # print("  Permute tensor: " + name)
        data = permute(data, hparams["num_attention_heads"])

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype_cur = 0
    if ftype != 0:
        if name.endswith(".weight") and n_dims == 2:
            # print("  Converting to float16")
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            # print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    else:
        if data.dtype != np.float32:
            # print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0

    gguf_writer.write_tensor_to_file(data)
gguf_writer.close()
print("gguf: conversion done, output file: " + fname_out)
print("")