From 5e67008f389608b4850fab761c2549a5ffba92ac Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 10 Dec 2024 14:40:03 +0200 Subject: [PATCH] llama : add OuteTTS support (wip) --- convert_hf_to_gguf.py | 26 ++++++ examples/tts/convert_pt_to_hf.py | 141 +++++++++++++++++++++++++++++++ gguf-py/gguf/constants.py | 65 ++++++++++++++ gguf-py/gguf/tensor_mapping.py | 83 ++++++++++++++++++ 4 files changed, 315 insertions(+) create mode 100644 examples/tts/convert_pt_to_hf.py diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 9dc1673bc..2a9ed6a71 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -689,6 +689,9 @@ class Model: return res # Marker: End get_vocab_base_pre + def _set_vocab_none(self) -> None: + self.gguf_writer.add_tokenizer_model("none") + def _set_vocab_gpt2(self) -> None: tokens, toktypes, tokpre = self.get_vocab_base() self.gguf_writer.add_tokenizer_model("gpt2") @@ -2027,6 +2030,29 @@ class Qwen2VLModel(Model): yield name, data +@Model.register("OuteTTSVocoder") +class OuteTTSVocoderModel(Model): + model_arch = gguf.MODEL_ARCH.OUTETTS_VOC + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + if \ + name.endswith("codebook.cluster_size") or \ + name.endswith("codebook.embed_avg") or \ + name.endswith("codebook.inited"): + logger.debug(f"Skipping {name!r}") + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def set_vocab(self): + self._set_vocab_none() + + def set_gguf_parameters(self): + self.gguf_writer.add_block_count(self.block_count) + + @Model.register("Qwen2MoeForCausalLM") class Qwen2MoeModel(Model): model_arch = gguf.MODEL_ARCH.QWEN2MOE diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py new file mode 100644 index 000000000..c77aee6a8 --- /dev/null +++ b/examples/tts/convert_pt_to_hf.py @@ -0,0 +1,141 @@ +# convert the https://huggingface.co/novateur/WavTokenizer-large-speech-75token to HF 
format +# the goal is to be able to reuse the convert_hf_to_gguf.py after that to create a GGUF file with the OuteTTS vocoder +# +# TODO: this script is LLM-generated and probably very inefficient and should be rewritten + +import torch +import json +import os +import sys +import re + +from safetensors.torch import save_file + +# change path to script dir +os.chdir(os.path.dirname(os.path.abspath(__file__))) + +# default +model_path = './model.pt'; + +# read from CLI +if len(sys.argv) > 1: + model_path = sys.argv[1] + +# get the directory of the input model +path_dst = os.path.dirname(model_path) + +print(f"Loading model from {model_path}") + +model = torch.load(model_path, map_location='cpu') + +#print(model) + +# print all keys +for key in model.keys(): + print(key) + if key == 'hyper_parameters': + #print(model[key]) + # dump as json pretty + print(json.dumps(model[key], indent=4)) + #if key != 'state_dict' and key != 'optimizer_states': + # print(model[key]) + +# Check if the loaded model is a state_dict or a model instance +if isinstance(model, torch.nn.Module): + state_dict = model.state_dict() +else: + state_dict = model + +# Print the structure of the state_dict to understand its format +print("State dictionary keys:") +for key in state_dict.keys(): + print(key) + +# Ensure the state_dict is flat and contains only torch.Tensor objects +def flatten_state_dict(state_dict, parent_key='', sep='.'): + items = [] + items_new = [] + + for k, v in state_dict.items(): + new_key = f"{parent_key}{sep}{k}" if parent_key else k + if isinstance(v, torch.Tensor): + items.append((new_key, v)) + elif isinstance(v, dict): + items.extend(flatten_state_dict(v, new_key, sep=sep).items()) + return dict(items) + + size_total_mb = 0 + + for key, value in list(items): + # keep only what we need for inference + if not key.startswith('state_dict.feature_extractor.encodec.quantizer.') and \ + not key.startswith('state_dict.backbone.') and \ + not key.startswith('state_dict.head.'): +
print('Skipping key: ', key) + continue + + new_key = key + + new_key = new_key.replace('state_dict.', '') + + # check if matches "backbone.pos_net.%d.bias" or "backbone.pos_net.%d.weight" + if new_key.startswith("backbone.pos_net."): + match = re.match(r"backbone\.pos_net\.(\d+)\.(bias|weight)", new_key) + if match: + new_key = f"backbone.pos_net.{match.group(1)}.norm.{match.group(2)}" + + size_mb = value.element_size() * value.nelement() / (1024 * 1024) + print(f"{size_mb:8.2f} MB - {new_key}: {value.shape}") + + size_total_mb += size_mb + + #print(key, '->', new_key, ': ', value) + #print(key, '->', new_key) + + items_new.append((new_key, value)) + + print(f"Total size: {size_total_mb:8.2f} MB") + + return dict(items_new) + +flattened_state_dict = flatten_state_dict(state_dict) + + +# Convert the model to the safetensors format +output_path = path_dst + '/model.safetensors' +save_file(flattened_state_dict, output_path) + +print(f"Model has been successfully converted and saved to {output_path}") + +# Calculate the total size of the .safetensors file +total_size = os.path.getsize(output_path) + +# Create the weight map +weight_map = { + "model.safetensors": ["*"] # Assuming all weights are in one file +} + +# Create metadata for the index.json file +metadata = { + "total_size": total_size, + "weight_map": weight_map +} + +# Save the metadata to index.json +index_path = path_dst + '/index.json' +with open(index_path, 'w') as f: + json.dump(metadata, f, indent=4) + +print(f"Metadata has been saved to {index_path}") + +config = { + "architectures": [ + "OuteTTSVocoder" + ], + "num_hidden_layers": 12 +} + +with open(path_dst + '/config.json', 'w') as f: + json.dump(config, f, indent=4) + +print(f"Config has been saved to {path_dst + '/config.json'}") diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index c2c7cad14..37d8bce47 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -261,11 +261,13 @@ class MODEL_ARCH(IntEnum): GRANITE
= auto() GRANITE_MOE = auto() CHAMELEON = auto() + OUTETTS_VOC = auto() class MODEL_TENSOR(IntEnum): TOKEN_EMBD = auto() TOKEN_EMBD_NORM = auto() + TOKEN_EMBD_SHIFT = auto() TOKEN_TYPES = auto() POS_EMBD = auto() OUTPUT = auto() @@ -370,6 +372,24 @@ class MODEL_TENSOR(IntEnum): ENC_OUTPUT_NORM = auto() CLS = auto() # classifier CLS_OUT = auto() # classifier output projection + CONV_NEXT_DW = auto() + CONV_NEXT_NORM = auto() + CONV_NEXT_SHIFT = auto() + CONV_NEXT_PW1 = auto() + CONV_NEXT_PW2 = auto() + CONV_NEXT_GAMMA = auto() + POS_NET_CONV1 = auto() + POS_NET_CONV2 = auto() + POS_NET_NORM = auto() + POS_NET_NORM1 = auto() + POS_NET_NORM2 = auto() + POS_NET_ATTN_NORM = auto() + POS_NET_ATTN_Q = auto() + POS_NET_ATTN_K = auto() + POS_NET_ATTN_V = auto() + POS_NET_ATTN_OUT = auto() + QNTZ_CBOOK_EMBD = auto() + HANN_WINDOW = auto() MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { @@ -425,11 +445,13 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.GRANITE: "granite", MODEL_ARCH.GRANITE_MOE: "granitemoe", MODEL_ARCH.CHAMELEON: "chameleon", + MODEL_ARCH.OUTETTS_VOC: "outetts-voc", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.TOKEN_EMBD: "token_embd", MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", + MODEL_TENSOR.TOKEN_EMBD_SHIFT: "token_embd_shift", MODEL_TENSOR.TOKEN_TYPES: "token_types", MODEL_TENSOR.POS_EMBD: "position_embd", MODEL_TENSOR.OUTPUT_NORM: "output_norm", @@ -534,6 +556,24 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm", MODEL_TENSOR.CLS: "cls", MODEL_TENSOR.CLS_OUT: "cls.output", + MODEL_TENSOR.CONV_NEXT_DW: "conv_next.{bid}.dw", + MODEL_TENSOR.CONV_NEXT_NORM: "conv_next.{bid}.norm", + MODEL_TENSOR.CONV_NEXT_SHIFT: "conv_next.{bid}.shift", + MODEL_TENSOR.CONV_NEXT_PW1: "conv_next.{bid}.pw1", + MODEL_TENSOR.CONV_NEXT_PW2: "conv_next.{bid}.pw2", + MODEL_TENSOR.CONV_NEXT_GAMMA: "conv_next.{bid}.gamma", + MODEL_TENSOR.POS_NET_CONV1: "pos_net.{bid}.conv1", + MODEL_TENSOR.POS_NET_CONV2: 
"pos_net.{bid}.conv2", + MODEL_TENSOR.POS_NET_NORM: "pos_net.{bid}.norm", + MODEL_TENSOR.POS_NET_NORM1: "pos_net.{bid}.norm1", + MODEL_TENSOR.POS_NET_NORM2: "pos_net.{bid}.norm2", + MODEL_TENSOR.POS_NET_ATTN_NORM: "pos_net.{bid}.attn_norm", + MODEL_TENSOR.POS_NET_ATTN_Q: "pos_net.{bid}.attn_q", + MODEL_TENSOR.POS_NET_ATTN_K: "pos_net.{bid}.attn_k", + MODEL_TENSOR.POS_NET_ATTN_V: "pos_net.{bid}.attn_v", + MODEL_TENSOR.POS_NET_ATTN_OUT: "pos_net.{bid}.attn_output", + MODEL_TENSOR.QNTZ_CBOOK_EMBD: "qntz.cbook.{bid}.embd", + MODEL_TENSOR.HANN_WINDOW: "hann_window", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -1372,6 +1412,31 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.OUTETTS_VOC: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, + MODEL_TENSOR.TOKEN_EMBD_SHIFT, + MODEL_TENSOR.CONV_NEXT_DW, + MODEL_TENSOR.CONV_NEXT_NORM, + MODEL_TENSOR.CONV_NEXT_SHIFT, + MODEL_TENSOR.CONV_NEXT_PW1, + MODEL_TENSOR.CONV_NEXT_PW2, + MODEL_TENSOR.CONV_NEXT_GAMMA, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.POS_NET_CONV1, + MODEL_TENSOR.POS_NET_CONV2, + MODEL_TENSOR.POS_NET_NORM, + MODEL_TENSOR.POS_NET_NORM1, + MODEL_TENSOR.POS_NET_NORM2, + MODEL_TENSOR.POS_NET_ATTN_NORM, + MODEL_TENSOR.POS_NET_ATTN_Q, + MODEL_TENSOR.POS_NET_ATTN_K, + MODEL_TENSOR.POS_NET_ATTN_V, + MODEL_TENSOR.POS_NET_ATTN_OUT, + MODEL_TENSOR.QNTZ_CBOOK_EMBD, + MODEL_TENSOR.HANN_WINDOW, + ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 573d0282e..39eeea434 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -28,6 +28,7 @@ class TensorNameMap: "transformer.token_embeddings", # openelm "shared", # t5 "rwkv.embeddings", # rwkv + "backbone.embed", # outetts ), # Token type embeddings @@ -42,6 +43,11 @@ class TensorNameMap: "emb_ln", # nomic-bert "transformer.norm", # openelm "rwkv.blocks.0.pre_ln", # rwkv + 
"backbone.norm.scale", # outetts + ), + + MODEL_TENSOR.TOKEN_EMBD_SHIFT: ( + "backbone.norm.shift", # outetts ), # Position embeddings @@ -60,6 +66,7 @@ class TensorNameMap: "lm_head.linear", # phi2 "output_layer", # chatglm "head", # rwkv + "head.out", # outetts ), # Output norm @@ -80,6 +87,7 @@ class TensorNameMap: "transformer.norm", # openelm "model.norm", # nemotron "rwkv.ln_out", # rwkv + "backbone.final_layer_norm", # outetts ), # Rope frequencies @@ -90,6 +98,10 @@ class TensorNameMap: MODEL_TENSOR.ROPE_FACTORS_LONG: (), MODEL_TENSOR.ROPE_FACTORS_SHORT: (), + + MODEL_TENSOR.HANN_WINDOW: ( + "head.istft.window", # outetts + ), } block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { @@ -681,6 +693,8 @@ class TensorNameMap: "encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5 ), + ############################################################################ + # TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg MODEL_TENSOR.ENC_OUTPUT_NORM: ( "encoder.final_layer_norm", # t5 ), @@ -693,6 +707,75 @@ class TensorNameMap: MODEL_TENSOR.CLS_OUT: ( "classifier.out_proj", # roberta ), + ############################################################################# + + MODEL_TENSOR.CONV_NEXT_DW: ( + "backbone.convnext.{bid}.dwconv", # outetts + ), + + MODEL_TENSOR.CONV_NEXT_NORM: ( + "backbone.convnext.{bid}.norm.scale", # outetts + ), + + MODEL_TENSOR.CONV_NEXT_SHIFT: ( + "backbone.convnext.{bid}.norm.shift", # outetts + ), + + MODEL_TENSOR.CONV_NEXT_PW1: ( + "backbone.convnext.{bid}.pwconv1", # outetts + ), + + MODEL_TENSOR.CONV_NEXT_PW2: ( + "backbone.convnext.{bid}.pwconv2", # outetts + ), + + MODEL_TENSOR.CONV_NEXT_GAMMA: ( + "backbone.convnext.{bid}.gamma", # outetts + ), + + MODEL_TENSOR.POS_NET_CONV1: ( + "backbone.pos_net.{bid}.conv1", # outetts + ), + + MODEL_TENSOR.POS_NET_CONV2: ( + "backbone.pos_net.{bid}.conv2", # outetts + ), + + MODEL_TENSOR.POS_NET_NORM: ( + "backbone.pos_net.{bid}.norm", # outetts + ), + + 
MODEL_TENSOR.POS_NET_NORM1: ( + "backbone.pos_net.{bid}.norm1", # outetts + ), + + MODEL_TENSOR.POS_NET_NORM2: ( + "backbone.pos_net.{bid}.norm2", # outetts + ), + + MODEL_TENSOR.POS_NET_ATTN_NORM: ( + "backbone.pos_net.{bid}.norm", # outetts + ), + + MODEL_TENSOR.POS_NET_ATTN_Q: ( + "backbone.pos_net.{bid}.q", # outetts + ), + + MODEL_TENSOR.POS_NET_ATTN_K: ( + "backbone.pos_net.{bid}.k", # outetts + ), + + MODEL_TENSOR.POS_NET_ATTN_V: ( + "backbone.pos_net.{bid}.v", # outetts + ), + + MODEL_TENSOR.POS_NET_ATTN_OUT: ( + "backbone.pos_net.{bid}.proj_out", # outetts + ), + + MODEL_TENSOR.QNTZ_CBOOK_EMBD: ( + "feature_extractor.encodec.quantizer.vq.layers.{bid}._codebook.embed", # outetts + ), } # architecture-specific block mappings