gguf.py : merge all files in gguf.py

Georgi Gerganov 2023-08-16 19:55:49 +03:00
parent 88b5769487
commit c8ee87f141
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
6 changed files with 213 additions and 207 deletions


@@ -1,50 +0,0 @@
GGUF_MAGIC = 0x47475546
GGUF_VERSION = 1
GGUF_DEFAULT_ALIGNMENT = 32
# general
KEY_GENERAL_ARCHITECTURE = "general.architecture"
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
KEY_GENERAL_ALIGNMENT = "general.alignment"
KEY_GENERAL_NAME = "general.name"
KEY_GENERAL_AUTHOR = "general.author"
KEY_GENERAL_URL = "general.url"
KEY_GENERAL_DESCRIPTION = "general.description"
KEY_GENERAL_FILE_TYPE = "general.file_type"
KEY_GENERAL_LICENSE = "general.license"
KEY_GENERAL_SOURCE_URL = "general.source.url"
KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
# LLM
KEY_LLM_CONTEXT_LENGTH = "{llm}.context_length"
KEY_LLM_EMBEDDING_LENGTH = "{llm}.embedding_length"
KEY_LLM_BLOCK_COUNT = "{llm}.block_count"
KEY_LLM_FEED_FORWARD_LENGTH = "{llm}.feed_forward_length"
KEY_LLM_USE_PARALLEL_RESIDUAL = "{llm}.use_parallel_residual"
KEY_LLM_TENSOR_DATA_LAYOUT = "{llm}.tensor_data_layout"
# attention
KEY_ATTENTION_HEAD_COUNT = "{llm}.attention.head_count"
KEY_ATTENTION_HEAD_COUNT_KV = "{llm}.attention.head_count_kv"
KEY_ATTENTION_MAX_ALIBI_BIAS = "{llm}.attention.max_alibi_bias"
KEY_ATTENTION_CLAMP_KQV = "{llm}.attention.clamp_kqv"
KEY_ATTENTION_LAYERNORM_EPS = "{llm}.attention.layer_norm_epsilon"
KEY_ATTENTION_LAYERNORM_RMS_EPS = "{llm}.attention.layer_norm_rms_epsilon"
# RoPE
KEY_ROPE_DIMENSION_COUNT = "{llm}.rope.dimension_count"
KEY_ROPE_SCALE = "{llm}.rope.scale"
# tokenization
KEY_TOKENIZER_MODEL = "tokenizer.ggml.model"
KEY_TOKENIZER_LIST = "tokenizer.ggml.tokens"
KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"
KEY_TOKENIZER_SCORES = "tokenizer.ggml.scores"
KEY_TOKENIZER_MERGES = "tokenizer.ggml.merges"
KEY_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"
KEY_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"
KEY_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"
KEY_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"
KEY_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"
KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json"
KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"


@@ -1,15 +1,15 @@
# HF gptneox --> gguf conversion
import gguf
import gguf_namemap as tmap
import os
import sys
import struct
import json
import numpy as np
import torch
from typing import Any, List
from pathlib import Path
from transformers import AutoTokenizer
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
@@ -188,7 +188,7 @@ if Path(dir_model + "/tokenizer.json").is_file():
# TENSORS
tensor_map = tmap.get_tensor_namemap(block_count)
tensor_map = gguf.get_tensor_name_map(block_count)
# tensor info
print("gguf: get tensor metadata")
@@ -227,7 +227,7 @@ for part_name in part_names:
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data.dtype == np.float16:
@@ -292,7 +292,7 @@ for part_name in part_names:
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data.dtype == np.float16:
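# The body of the check above is cut off by this hunk. Going by the comment,
# it presumably up-casts half-precision tensors before writing; a hedged
# sketch, not part of the shown diff:
#
#     if ftype == 0 and data.dtype == np.float16:
#         data = data.astype(np.float32)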


@@ -3,18 +3,17 @@
# HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model
import gguf
import gguf_namemap as tmap
import os
import sys
import struct
import json
import numpy as np
import torch
from typing import Any, List
from pathlib import Path
from sentencepiece import SentencePieceProcessor
#NDArray = np.ndarray[Any, Any]
# compatible with python < 3.9
NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
@@ -189,7 +188,7 @@ if Path(dir_model + "/tokenizer.json").is_file():
# TENSORS
tensor_map = tmap.get_tensor_namemap(block_count)
tensor_map = gguf.get_tensor_name_map(block_count)
# tensor info
print("gguf: get tensor metadata")


@@ -1,8 +1,6 @@
# HF llama --> gguf conversion
import gguf
import gguf_namemap as tmap
import os
import sys
import struct
@@ -201,7 +199,7 @@ if Path(dir_model + "/tokenizer.json").is_file():
# TENSORS
tensor_map = tmap.get_tensor_namemap(block_count)
tensor_map = gguf.get_tensor_name_map(block_count)
# tensor info
print("gguf: get tensor metadata")

gguf.py

@@ -4,14 +4,169 @@
3. After development is done, convert it to a proper pip-installable Python package, and possibly move it to its own repo under ggml-org.
"""
import sys
import struct
import constants
import numpy as np
from enum import IntEnum
from typing import Any, IO, List
import numpy as np
import sys
#
# constants
#
GGUF_MAGIC = 0x47475546
GGUF_VERSION = 1
GGUF_DEFAULT_ALIGNMENT = 32
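# Note: 0x47475546 is the ASCII string "GGUF" packed into a 32-bit integer
# (0x47 = 'G', 0x47 = 'G', 0x55 = 'U', 0x46 = 'F').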
# general
KEY_GENERAL_ARCHITECTURE = "general.architecture"
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
KEY_GENERAL_ALIGNMENT = "general.alignment"
KEY_GENERAL_NAME = "general.name"
KEY_GENERAL_AUTHOR = "general.author"
KEY_GENERAL_URL = "general.url"
KEY_GENERAL_DESCRIPTION = "general.description"
KEY_GENERAL_FILE_TYPE = "general.file_type"
KEY_GENERAL_LICENSE = "general.license"
KEY_GENERAL_SOURCE_URL = "general.source.url"
KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
# LLM
KEY_LLM_CONTEXT_LENGTH = "{llm}.context_length"
KEY_LLM_EMBEDDING_LENGTH = "{llm}.embedding_length"
KEY_LLM_BLOCK_COUNT = "{llm}.block_count"
KEY_LLM_FEED_FORWARD_LENGTH = "{llm}.feed_forward_length"
KEY_LLM_USE_PARALLEL_RESIDUAL = "{llm}.use_parallel_residual"
KEY_LLM_TENSOR_DATA_LAYOUT = "{llm}.tensor_data_layout"
# attention
KEY_ATTENTION_HEAD_COUNT = "{llm}.attention.head_count"
KEY_ATTENTION_HEAD_COUNT_KV = "{llm}.attention.head_count_kv"
KEY_ATTENTION_MAX_ALIBI_BIAS = "{llm}.attention.max_alibi_bias"
KEY_ATTENTION_CLAMP_KQV = "{llm}.attention.clamp_kqv"
KEY_ATTENTION_LAYERNORM_EPS = "{llm}.attention.layer_norm_epsilon"
KEY_ATTENTION_LAYERNORM_RMS_EPS = "{llm}.attention.layer_norm_rms_epsilon"
# RoPE
KEY_ROPE_DIMENSION_COUNT = "{llm}.rope.dimension_count"
KEY_ROPE_SCALE = "{llm}.rope.scale"
# tokenization
KEY_TOKENIZER_MODEL = "tokenizer.ggml.model"
KEY_TOKENIZER_LIST = "tokenizer.ggml.tokens"
KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"
KEY_TOKENIZER_SCORES = "tokenizer.ggml.scores"
KEY_TOKENIZER_MERGES = "tokenizer.ggml.merges"
KEY_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id"
KEY_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id"
KEY_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id"
KEY_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id"
KEY_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id"
KEY_TOKENIZER_HF_JSON = "tokenizer.huggingface.json"
KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
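# The "{llm}" placeholder in the keys above is filled in with the model
# architecture by the add_* helpers of GGUFWriter below (via str.format), e.g.:
#
#     KEY_LLM_CONTEXT_LENGTH.format(llm="llama")      # -> "llama.context_length"
#     KEY_ATTENTION_HEAD_COUNT.format(llm="gptneox")  # -> "gptneox.attention.head_count"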
#
# recommended mapping of model tensor names for storage in gguf
#
def get_tensor_name_map(n_blocks : int):
tensor_map = {}
# Token embeddings
mapped_to = "token_embd"
tensor_map["gpt_neox.embed_in"] = mapped_to # gptneox
tensor_map["transformer.wte"] = mapped_to # gpt2 mpt
tensor_map["transformer.word_embeddings"] = mapped_to # falcon
tensor_map["model.embed_tokens"] = mapped_to # llama-hf
tensor_map["tok_embeddings"] = mapped_to # llama-pth
# Position embeddings
mapped_to = "pos_embd"
tensor_map["transformer.wpe"] = mapped_to # gpt2
# Output norm
mapped_to = "output_norm"
tensor_map["gpt_neox.final_layer_norm"] = mapped_to # gptneox
tensor_map["transformer.ln_f"] = mapped_to # gpt2 falcon
tensor_map["transformer.norm_f"] = mapped_to # mpt
tensor_map["model.norm"] = mapped_to # llama-hf
tensor_map["norm"] = mapped_to # llama-pth
# Output
mapped_to = "output"
tensor_map["embed_out"] = mapped_to # gptneox
tensor_map["lm_head"] = mapped_to # gpt2 mpt falcon llama-hf
tensor_map["output"] = mapped_to # llama-pth
# Attention and feed-forward layer blocks
for i in range(0,n_blocks):
# Attention norm
mapped_to = "blk."+str(i)+".attn_norm"
tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".ln_1"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".norm_1"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".input_layernorm"] = mapped_to # falcon7b
tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to # falcon40b
tensor_map["model.layers."+str(i)+".input_layernorm"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention_norm"] = mapped_to # llama-pth
# Attention norm 2
mapped_to = "blk."+str(i)+".attn_norm_2"
tensor_map["transformer.h."+str(i)+".ln_mlp"] = mapped_to # falcon40b
# Attention query-key-value
mapped_to = "blk."+str(i)+".attn_qkv"
tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".attn.c_attn"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to # falcon
# Attention query
mapped_to = "blk."+str(i)+".attn_q"
tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wq"] = mapped_to # llama-pth
# Attention key
mapped_to = "blk."+str(i)+".attn_k"
tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wk"] = mapped_to # llama-pth
# Attention value
mapped_to = "blk."+str(i)+".attn_v"
tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wv"] = mapped_to # llama-pth
# Attention output
mapped_to = "blk."+str(i)+".attn_output"
tensor_map["gpt_neox.layers."+str(i)+".attention.dense"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".attn.c_proj"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".attn.out_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".self_attn.o_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wo"] = mapped_to # llama-pth
# Feed-forward norm
mapped_to = "blk."+str(i)+".ffn_norm"
tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".ln_2"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".norm_2"] = mapped_to # mpt
tensor_map["model.layers."+str(i)+".post_attention_layernorm"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".ffn_norm"] = mapped_to # llama-pth
# Feed-forward up
mapped_to = "blk."+str(i)+".ffn_up"
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".mlp.c_fc"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".mlp.up_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w3"] = mapped_to # llama-pth
# Feed-forward gate
mapped_to = "blk."+str(i)+".ffn_gate"
tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w1"] = mapped_to # llama-pth
# Feed-forward down
mapped_to = "blk."+str(i)+".ffn_down"
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".mlp.c_proj"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".mlp.down_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w2"] = mapped_to # llama-pth
return tensor_map
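# Usage sketch: a conversion script builds the map once for the model's block
# count and then looks up each original tensor name to get its gguf name, e.g.:
#
#     tensor_map = get_tensor_name_map(block_count)
#     tensor_map["gpt_neox.layers.0.attention.dense"]  # -> "blk.0.attn_output"
#     tensor_map["model.layers.0.self_attn.q_proj"]    # -> "blk.0.attn_q"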
#
# implementation
#
class GGMLQuantizationType(IntEnum):
F32 = 0
@@ -19,16 +174,16 @@ class GGMLQuantizationType(IntEnum):
class GGUFValueType(IntEnum):
UINT8 = 0
INT8 = 1
UINT16 = 2
INT16 = 3
UINT32 = 4
INT32 = 5
FLOAT32 = 6
BOOL = 7
STRING = 8
ARRAY = 9
@staticmethod
def get_type(val):
@@ -51,15 +206,15 @@ class GGUFWriter:
def __init__(self, fout: IO):
self.fout = fout
self.offset_tensor = 0
self.data_alignment = constants.GGUF_DEFAULT_ALIGNMENT
self.data_alignment = GGUF_DEFAULT_ALIGNMENT
self.kv_data = b""
self.kv_data_count = 0
self.ti_data = b""
self.ti_data_count = 0
def write_header_to_file(self):
self.fout.write(struct.pack("<I", constants.GGUF_MAGIC))
self.fout.write(struct.pack("<I", constants.GGUF_VERSION))
self.fout.write(struct.pack("<I", GGUF_MAGIC))
self.fout.write(struct.pack("<I", GGUF_VERSION))
self.fout.write(struct.pack("<I", self.ti_data_count))
self.fout.write(struct.pack("<I", self.kv_data_count))
self.flush()
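# The header is therefore four little-endian uint32 values: magic, version,
# tensor-info count and key/value count. A reader could unpack it with a
# matching call (hedged sketch; "fin" is an assumed open binary file):
#
#     magic, version, n_tensors, n_kv = struct.unpack("<IIII", fin.read(16))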
@@ -201,126 +356,125 @@ class GGUFWriter:
self.fout.close()
def add_architecture(self, architecture: str):
self.add_string(constants.KEY_GENERAL_ARCHITECTURE,
self.add_string(KEY_GENERAL_ARCHITECTURE,
architecture)
def add_author(self, author: str):
self.add_string(constants.KEY_GENERAL_AUTHOR, author)
self.add_string(KEY_GENERAL_AUTHOR, author)
def add_tensor_data_layout(self, layout: str):
self.add_string(constants.KEY_LLM_TENSOR_DATA_LAYOUT , layout)
self.add_string(KEY_LLM_TENSOR_DATA_LAYOUT , layout)
def add_url(self, url: str):
self.add_string(constants.KEY_GENERAL_URL, url)
self.add_string(KEY_GENERAL_URL, url)
def add_description(self, description: str):
self.add_string(constants.KEY_GENERAL_DESCRIPTION, description)
self.add_string(KEY_GENERAL_DESCRIPTION, description)
def add_file_type(self, file_type: str):
self.add_string(constants.KEY_GENERAL_FILE_TYPE, file_type)
self.add_string(KEY_GENERAL_FILE_TYPE, file_type)
def add_source_url(self, url: str):
self.add_string(constants.KEY_GENERAL_SOURCE_URL, url)
self.add_string(KEY_GENERAL_SOURCE_URL, url)
def add_source_hf_repo(self, repo: str):
self.add_string(constants.KEY_GENERAL_SOURCE_HF_REPO, repo)
self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)
def add_name(self, name: str):
self.add_string(constants.KEY_GENERAL_NAME, name)
self.add_string(KEY_GENERAL_NAME, name)
def add_quantization_version(self, quantization_version: GGMLQuantizationType):
self.add_uint32(
constants.KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)
KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)
def add_custom_alignment(self, alignment: int):
self.data_alignment = alignment
self.add_uint32(constants.KEY_GENERAL_ALIGNMENT, alignment)
self.add_uint32(KEY_GENERAL_ALIGNMENT, alignment)
def add_context_length(self, llm: str, length: int):
self.add_uint32(
constants.KEY_LLM_CONTEXT_LENGTH.format(llm=llm), length)
KEY_LLM_CONTEXT_LENGTH.format(llm=llm), length)
def add_embedding_length(self, llm: str, length: int):
self.add_uint32(
constants.KEY_LLM_EMBEDDING_LENGTH.format(llm=llm), length)
KEY_LLM_EMBEDDING_LENGTH.format(llm=llm), length)
def add_block_count(self, llm: str, length: int):
self.add_uint32(
constants.KEY_LLM_BLOCK_COUNT.format(llm=llm), length)
KEY_LLM_BLOCK_COUNT.format(llm=llm), length)
def add_feed_forward_length(self, llm: str, length: int):
self.add_uint32(
constants.KEY_LLM_FEED_FORWARD_LENGTH.format(llm=llm), length)
KEY_LLM_FEED_FORWARD_LENGTH.format(llm=llm), length)
def add_parallel_residual(self, llm: str, use: bool):
self.add_bool(
constants.KEY_LLM_USE_PARALLEL_RESIDUAL.format(llm=llm), use)
KEY_LLM_USE_PARALLEL_RESIDUAL.format(llm=llm), use)
def add_tensor_data_layout(self, llm: str, layout: str):
self.add_string(
constants.KEY_LLM_TENSOR_DATA_LAYOUT.format(llm=llm), layout)
KEY_LLM_TENSOR_DATA_LAYOUT.format(llm=llm), layout)
def add_head_count(self, llm: str, count: int):
self.add_uint32(
constants.KEY_ATTENTION_HEAD_COUNT.format(llm=llm), count)
KEY_ATTENTION_HEAD_COUNT.format(llm=llm), count)
def add_head_count_kv(self, llm: str, count: int):
self.add_uint32(
constants.KEY_ATTENTION_HEAD_COUNT_KV.format(llm=llm), count)
KEY_ATTENTION_HEAD_COUNT_KV.format(llm=llm), count)
def add_max_alibi_bias(self, llm: str, bias: float):
self.add_float32(
constants.KEY_ATTENTION_MAX_ALIBI_BIAS.format(llm=llm), bias)
KEY_ATTENTION_MAX_ALIBI_BIAS.format(llm=llm), bias)
def add_clamp_kqv(self, llm: str, value: float):
self.add_float32(
constants.KEY_ATTENTION_CLAMP_KQV.format(llm=llm), value)
KEY_ATTENTION_CLAMP_KQV.format(llm=llm), value)
def add_layer_norm_eps(self, llm: str, value: float):
self.add_float32(
constants.KEY_ATTENTION_LAYERNORM_EPS.format(llm=llm), value)
KEY_ATTENTION_LAYERNORM_EPS.format(llm=llm), value)
def add_layer_norm_rms_eps(self, llm: str, value: float):
self.add_float32(
constants.KEY_ATTENTION_LAYERNORM_RMS_EPS.format(llm=llm), value)
KEY_ATTENTION_LAYERNORM_RMS_EPS.format(llm=llm), value)
def add_rope_dimension_count(self, llm: str, count: int):
self.add_uint32(
constants.KEY_ROPE_DIMENSION_COUNT.format(llm=llm), count)
KEY_ROPE_DIMENSION_COUNT.format(llm=llm), count)
def add_rope_scale(self, llm: str, value: float):
self.add_float32(constants.KEY_ROPE_SCALE.format(llm=llm), value)
self.add_float32(KEY_ROPE_SCALE.format(llm=llm), value)
def add_tokenizer_model(self, model: str):
self.add_string(constants.KEY_TOKENIZER_MODEL, model)
self.add_string(KEY_TOKENIZER_MODEL, model)
def add_token_list(self, tokens: List):
self.add_array(constants.KEY_TOKENIZER_LIST, tokens)
self.add_array(KEY_TOKENIZER_LIST, tokens)
def add_token_merges(self, merges: List):
self.add_array(constants.KEY_TOKENIZER_MERGES, merges)
self.add_array(KEY_TOKENIZER_MERGES, merges)
def add_token_types(self, types: List[int]):
self.add_array(constants.KEY_TOKENIZER_TOKEN_TYPE, types)
self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types)
def add_token_scores(self, scores: List[float]):
self.add_array(constants.KEY_TOKENIZER_SCORES, scores)
self.add_array(KEY_TOKENIZER_SCORES, scores)
def add_bos_token_id(self, id: int):
self.add_uint32(constants.KEY_TOKENIZER_BOS_ID, id)
self.add_uint32(KEY_TOKENIZER_BOS_ID, id)
def add_eos_token_id(self, id: int):
self.add_uint32(constants.KEY_TOKENIZER_EOS_ID, id)
self.add_uint32(KEY_TOKENIZER_EOS_ID, id)
def add_unk_token_id(self, id: int):
self.add_uint32(constants.KEY_TOKENIZER_UNK_ID, id)
self.add_uint32(KEY_TOKENIZER_UNK_ID, id)
def add_sep_token_id(self, id: int):
self.add_uint32(constants.KEY_TOKENIZER_SEP_ID, id)
self.add_uint32(KEY_TOKENIZER_SEP_ID, id)
def add_pad_token_id(self, id: int):
self.add_uint32(constants.KEY_TOKENIZER_PAD_ID, id)
self.add_uint32(KEY_TOKENIZER_PAD_ID, id)
# Example usage:
if __name__ == "__main__":
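# The example body is truncated in this view; a hedged sketch of how the
# metadata helpers above might be combined (file name and values are
# illustrative, not taken from the commit):
#
#     fout = open("example.gguf", "wb")
#     gguf_writer = GGUFWriter(fout)
#     gguf_writer.add_architecture("llama")
#     gguf_writer.add_name("example model")
#     gguf_writer.add_context_length("llama", 2048)
#     gguf_writer.add_block_count("llama", 32)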


@@ -1,95 +0,0 @@
# Recommended mapping of model tensor names for storage in gguf
def get_tensor_namemap( n_blocks : int):
tensor_map = {}
# Token embeddings
mapped_to = "token_embd"
tensor_map["gpt_neox.embed_in"] = mapped_to # gptneox
tensor_map["transformer.wte"] = mapped_to # gpt2 mpt
tensor_map["transformer.word_embeddings"] = mapped_to # falcon
tensor_map["model.embed_tokens"] = mapped_to # llama-hf
tensor_map["tok_embeddings"] = mapped_to # llama-pth
# Position embeddings
mapped_to = "pos_embd"
tensor_map["transformer.wpe"] = mapped_to # gpt2
# Output norm
mapped_to = "output_norm"
tensor_map["gpt_neox.final_layer_norm"] = mapped_to # gptneox
tensor_map["transformer.ln_f"] = mapped_to # gpt2 falcon
tensor_map["transformer.norm_f"] = mapped_to # mpt
tensor_map["model.norm"] = mapped_to # llama-hf
tensor_map["norm"] = mapped_to # llama-pth
# Output
mapped_to = "output"
tensor_map["embed_out"] = mapped_to # gptneox
tensor_map["lm_head"] = mapped_to # gpt2 mpt falcon llama-hf
tensor_map["output"] = mapped_to # llama-pth
# Attention and feed-forward layer blocks
for i in range(0,n_blocks):
# Attention norm
mapped_to = "blk."+str(i)+".attn_norm"
tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".ln_1"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".norm_1"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".input_layernorm"] = mapped_to # falcon7b
tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to # falcon40b
tensor_map["model.layers."+str(i)+".input_layernorm"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention_norm"] = mapped_to # llama-pth
# Attention norm 2
mapped_to = "blk."+str(i)+".attn_norm_2"
tensor_map["transformer.h."+str(i)+".ln_mlp"] = mapped_to # falcon40b
# Attention query-key-value
mapped_to = "blk."+str(i)+".attn_qkv"
tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".attn.c_attn"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to # falcon
# Attention query
mapped_to = "blk."+str(i)+".attn_q"
tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wq"] = mapped_to # llama-pth
# Attention key
mapped_to = "blk."+str(i)+".attn_k"
tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wk"] = mapped_to # llama-pth
# Attention value
mapped_to = "blk."+str(i)+".attn_v"
tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wv"] = mapped_to # llama-pth
# Attention output
mapped_to = "blk."+str(i)+".attn_output"
tensor_map["gpt_neox.layers."+str(i)+".attention.dense"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".attn.c_proj"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".attn.out_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".self_attn.o_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".attention.wo"] = mapped_to # llama-pth
# Feed-forward norm
mapped_to = "blk."+str(i)+".ffn_norm"
tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".ln_2"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".norm_2"] = mapped_to # mpt
tensor_map["model.layers."+str(i)+".post_attention_layernorm"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".ffn_norm"] = mapped_to # llama-pth
# Feed-forward up
mapped_to = "blk."+str(i)+".ffn_up"
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".mlp.c_fc"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".mlp.up_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w3"] = mapped_to # llama-pth
# Feed-forward gate
mapped_to = "blk."+str(i)+".ffn_gate"
tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w1"] = mapped_to # llama-pth
# Feed-forward down
mapped_to = "blk."+str(i)+".ffn_down"
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # gptneox
tensor_map["transformer.h."+str(i)+".mlp.c_proj"] = mapped_to # gpt2
tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"] = mapped_to # mpt
tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # falcon
tensor_map["model.layers."+str(i)+".mlp.down_proj"] = mapped_to # llama-hf
tensor_map["layers."+str(i)+".feed_forward.w2"] = mapped_to # llama-pth
return tensor_map