# llama.cpp/examples/tts/convert_pt_to_hf.py

# convert the https://huggingface.co/novateur/WavTokenizer-large-speech-75token to HF format
# the goal is to be able to reuse the convert_hf_to_gguf.py after that to create a GGUF file with the WavTokenizer decoder
#
# TODO: this script is LLM-generated and probably very inefficient and should be rewritten

import torch
import json
import os
import sys
import re

from safetensors.torch import save_file

# default checkpoint location, used when no path is given on the command line
model_path = './model.pt'

# read from CLI
if len(sys.argv) > 1:
    model_path = sys.argv[1]

# get the directory of the input model
# os.path.dirname() returns '' for a bare file name such as "model.pt"; fall back to the
# current directory so that output paths built from path_dst do not resolve to "/"
path_dst = os.path.dirname(model_path) or '.'

print(f"Loading model from {model_path}")

# NOTE(review): .pt checkpoints are pickle-based and torch.load may execute code embedded in
# the file (depending on the torch version / weights_only setting) - only load trusted files
model = torch.load(model_path, map_location='cpu')

# Check if the loaded model is a state_dict or a model instance
# this has to happen before any dict-style access: a torch.nn.Module has no keys()
if isinstance(model, torch.nn.Module):
    state_dict = model.state_dict()
else:
    state_dict = model

    # print all top-level keys of the checkpoint
    for key in model.keys():
        print(key)
        if key == 'hyper_parameters':
            # dump as json pretty
            print(json.dumps(model[key], indent=4))

# Print the structure of the state_dict to understand its format
print("State dictionary keys:")
for key in state_dict.keys():
    print(key)

# Ensure the state_dict is flat and contains only torch.Tensor objects
def _collect_tensors(tree, parent_key='', sep='.'):
    """Recursively gather every torch.Tensor stored in the nested dict `tree`.

    Returns a list of (flat_key, tensor) pairs in traversal order. flat_key is the chain of
    dict keys joined with `sep`, prefixed by `parent_key` when that is non-empty.
    Values that are neither tensors nor dicts (ints, lists, ...) are ignored.
    """
    pairs = []

    for k, v in tree.items():
        flat_key = f"{parent_key}{sep}{k}" if parent_key else str(k)
        if isinstance(v, torch.Tensor):
            pairs.append((flat_key, v))
        elif isinstance(v, dict):
            pairs.extend(_collect_tensors(v, flat_key, sep=sep))

    return pairs


def flatten_state_dict(state_dict, parent_key='', sep='.'):
    """Flatten a (possibly nested) checkpoint dict and keep only the renamed inference tensors.

    Args:
        state_dict: the loaded checkpoint, a dict that may contain nested dicts.
        parent_key: optional prefix put in front of every flattened key.
        sep: separator used when joining nested keys.

    Returns:
        dict mapping the converted tensor names to tensors. Only flattened keys that start
        with 'state_dict.feature_extractor.encodec.quantizer.', 'state_dict.backbone.' or
        'state_dict.head.out' are kept; everything else is reported and skipped.

    The whole tree is flattened first and the filter/rename pass runs exactly once over the
    result, so the output does not depend on where 'state_dict' sits among the other
    top-level entries, and tensors outside of it never leak into the output unfiltered.
    Progress (skipped keys, per-tensor size, total size) is printed to stdout.
    """
    keep_prefixes = (
        'state_dict.feature_extractor.encodec.quantizer.',
        'state_dict.backbone.',
        'state_dict.head.out',
    )

    # matches "backbone.posnet.%d.bias" or "backbone.posnet.%d.weight"
    posnet_norm_re = re.compile(r"backbone\.posnet\.(\d+)\.(bias|weight)")

    items_new = []
    size_total_mb = 0

    for key, value in _collect_tensors(state_dict, parent_key, sep):
        # keep only what we need for inference
        if not key.startswith(keep_prefixes):
            print('Skipping key: ', key)
            continue

        new_key = key.replace('state_dict.', '')
        new_key = new_key.replace('pos_net', 'posnet')

        # tensors that sit directly on a posnet layer get an explicit ".norm" component
        if new_key.startswith("backbone.posnet."):
            match = posnet_norm_re.match(new_key)
            if match:
                new_key = f"backbone.posnet.{match.group(1)}.norm.{match.group(2)}"

        # "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed" -> "backbone.embedding.weight"
        if new_key == "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed":
            new_key = "backbone.embedding.weight"

        # these are the only rows used
        # ref: https://github.com/edwko/OuteTTS/blob/a613e79c489d8256dd657ea9168d78de75895d82/outetts/wav_tokenizer/audio_codec.py#L100
        if new_key.endswith("norm.scale.weight"):
            new_key = new_key.replace("norm.scale.weight", "norm.weight")
            value = value[0]

        if new_key.endswith("norm.shift.weight"):
            new_key = new_key.replace("norm.shift.weight", "norm.bias")
            value = value[0]

        if new_key.endswith("gamma"):
            new_key = new_key.replace("gamma", "gamma.weight")

        # convert from 1D [768] to 2D [768, 1] so that ggml_add can broadcast the bias
        if new_key.endswith(("norm.weight", "norm1.weight", "norm2.weight", ".bias")) and \
           new_key.startswith(("backbone.posnet", "backbone.embed.bias")):
            value = value.unsqueeze(1)

        if new_key.endswith("dwconv.bias"):
            value = value.unsqueeze(1)

        size_mb = value.element_size() * value.nelement() / (1024 * 1024)
        print(f"{size_mb:8.2f} MB - {new_key}: {value.shape}")

        size_total_mb += size_mb

        items_new.append((new_key, value))

    print(f"Total size: {size_total_mb:8.2f} MB")

    return dict(items_new)

flattened_state_dict = flatten_state_dict(state_dict)


# Convert the model to the safetensors format
# all output files go next to the input model; os.path.join also copes with an empty
# path_dst (the result is then relative to the current directory instead of "/")
output_path = os.path.join(path_dst, 'model.safetensors')
save_file(flattened_state_dict, output_path)

print(f"Model has been successfully converted and saved to {output_path}")

# Calculate the total size of the .safetensors file
total_size = os.path.getsize(output_path)

# Create the weight map
weight_map = {
    "model.safetensors": ["*"]  # Assuming all weights are in one file
}

# Create metadata for the index.json file
metadata = {
    "total_size": total_size,
    "weight_map": weight_map
}

# Save the metadata to index.json
index_path = os.path.join(path_dst, 'index.json')
with open(index_path, 'w') as f:
    json.dump(metadata, f, indent=4)

print(f"Metadata has been saved to {index_path}")

# hard-coded model description written to config.json next to the weights
config = {
    "architectures": [
        "WavTokenizerDec"
    ],
    "hidden_size": 1282,
    "n_embd_features": 512,
    "n_ff": 2304,
    "vocab_size": 4096,
    "n_head": 1,
    "layer_norm_epsilon": 1e-6,
    "group_norm_epsilon": 1e-6,
    "group_norm_groups": 32,
    "max_position_embeddings": 8192, # ?
    "n_layer": 12,
    "posnet": {
        "n_embd": 768,
        "n_layer": 6
    },
    "convnext": {
        "n_embd": 768,
        "n_layer": 12
    },
}

# build the path once so that the file written and the path reported are the same
config_path = os.path.join(path_dst, 'config.json')
with open(config_path, 'w') as f:
    json.dump(config, f, indent=4)

print(f"Config has been saved to {config_path}")
tts : add OuteTTS support (#10784) * server : add "tokens" output ggml-ci * server : output embeddings for all tokens when pooling = none ggml-ci * server : be explicit about the pooling type in the tests ggml-ci * server : do not normalize embeddings when there is no pooling ggml-ci * llama : add OuteTTS support (wip) * wip * extract features * first conv * group norm * resnet conv * resnet * attn * pos net * layer norm * convnext * head * hann window * fix n_embd + remove llama.cpp hacks * compute hann window * fft * spectrum processing * clean-up * tts : receive input text and generate codes * clip : fix new conv name * tts : minor fix * tts : add header + minor fixes ggml-ci * tts : add matchematical constant ggml-ci * tts : fix sampling + cut initial noise * tts : fixes * tts : update default samplers ggml-ci * tts : text pre-processing * tts : outetts-voc -> wavtokenizer-dec * tts : remove hardcoded constants ggml-ci * tts : fix tensor shapes * llama : refactor wavtokenizer tensors ggml-ci * cont ggml-ci * cont [no ci] * llama : update WavTokenizer to non-causal attn * llama : handle no-vocab detokenization * tts : add Python example for OuteTTS (wip) * tts : extend python example to generate spectrogram ggml-ci * server : fix rebase artifacts * tts : enable "return_tokens" in Python example ggml-ci * tts : minor fixes * common : support HF download for vocoder 2024-12-18 17:27:21 +00:00			`# convert the https://huggingface.co/novateur/WavTokenizer-large-speech-75token to HF format`
			`# the goal is to be able to reuse the convert_hf_to_gguf.py after that to create a GGUF file with the WavTokenizer decoder`
			`#`
			`# TODO: this script is LLM-generated and probably very inefficient and should be rewritten`

			`import torch`
			`import json`
			`import os`
			`import sys`
			`import re`

			`from safetensors.torch import save_file`

			`# default`
			`model_path = './model.pt';`

			`# read from CLI`
			`if len(sys.argv) > 1:`
			`model_path = sys.argv[1]`

			`# get the directory of the input model`
			`path_dst = os.path.dirname(model_path)`

			`print(f"Loading model from {model_path}")`

			`model = torch.load(model_path, map_location='cpu')`

			`#print(model)`

			`# print all keys`
			`for key in model.keys():`
			`print(key)`
			`if key == 'hyper_parameters':`
			`#print(model[key])`
			`# dump as json pretty`
			`print(json.dumps(model[key], indent=4))`
			`#if key != 'state_dict' and key != 'optimizer_states':`
			`# print(model[key])`

			`# Check if the loaded model is a state_dict or a model instance`
			`if isinstance(model, torch.nn.Module):`
			`state_dict = model.state_dict()`
			`else:`
			`state_dict = model`

			`# Print the structure of the state_dict to understand its format`
			`print("State dictionary keys:")`
			`for key in state_dict.keys():`
			`print(key)`

			`# Ensure the state_dict is flat and contains only torch.Tensor objects`
			`def flatten_state_dict(state_dict, parent_key='', sep='.'):`
			`items = []`
			`items_new = []`

			`for k, v in state_dict.items():`
			`new_key = f"{parent_key}{sep}{k}" if parent_key else k`
			`if isinstance(v, torch.Tensor):`
			`items.append((new_key, v))`
			`elif isinstance(v, dict):`
			`items.extend(flatten_state_dict(v, new_key, sep=sep).items())`
			`return dict(items)`

			`size_total_mb = 0`

			`for key, value in list(items):`
			`# keep only what we need for inference`
			`if not key.startswith('state_dict.feature_extractor.encodec.quantizer.') and \`
			`not key.startswith('state_dict.backbone.') and \`
			`not key.startswith('state_dict.head.out'):`
			`print('Skipping key: ', key)`
			`continue`

			`new_key = key`

			`new_key = new_key.replace('state_dict.', '')`
			`new_key = new_key.replace('pos_net', 'posnet')`

			`# check if matches "backbone.posnet.%d.bias" or "backbone.posnet.%d.weight"`
			`if new_key.startswith("backbone.posnet."):`
			`match = re.match(r"backbone\.posnet\.(\d+)\.(bias\|weight)", new_key)`
			`if match:`
			`new_key = f"backbone.posnet.{match.group(1)}.norm.{match.group(2)}"`

			`# "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed" -> "backbone.embedding.weight"`
			`if new_key == "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed":`
			`new_key = "backbone.embedding.weight"`

			`# these are the only rows used`
			`# ref: https://github.com/edwko/OuteTTS/blob/a613e79c489d8256dd657ea9168d78de75895d82/outetts/wav_tokenizer/audio_codec.py#L100`
			`if new_key.endswith("norm.scale.weight"):`
			`new_key = new_key.replace("norm.scale.weight", "norm.weight")`
			`value = value[0]`

			`if new_key.endswith("norm.shift.weight"):`
			`new_key = new_key.replace("norm.shift.weight", "norm.bias")`
			`value = value[0]`

			`if new_key.endswith("gamma"):`
			`new_key = new_key.replace("gamma", "gamma.weight")`

			`# convert from 1D [768] to 2D [768, 1] so that ggml_add can broadcast the bias`
			`if (new_key.endswith("norm.weight") or new_key.endswith("norm1.weight") or new_key.endswith("norm2.weight") or new_key.endswith(".bias")) and (new_key.startswith("backbone.posnet") or new_key.startswith("backbone.embed.bias")):`
			`value = value.unsqueeze(1)`

			`if new_key.endswith("dwconv.bias"):`
			`value = value.unsqueeze(1)`

			`size_mb = value.element_size() * value.nelement() / (1024 * 1024)`
			`print(f"{size_mb:8.2f} MB - {new_key}: {value.shape}")`

			`size_total_mb += size_mb`

			`#print(key, '->', new_key, ': ', value)`
			`#print(key, '->', new_key)`

			`items_new.append((new_key, value))`

			`print(f"Total size: {size_total_mb:8.2f} MB")`

			`return dict(items_new)`

			`flattened_state_dict = flatten_state_dict(state_dict)`


			`# Convert the model to the safetensors format`
			`output_path = path_dst + '/model.safetensors'`
			`save_file(flattened_state_dict, output_path)`

			`print(f"Model has been successfully converted and saved to {output_path}")`

			`# Calculate the total size of the .safetensors file`
			`total_size = os.path.getsize(output_path)`

			`# Create the weight map`
			`weight_map = {`
			`"model.safetensors": ["*"] # Assuming all weights are in one file`
			`}`

			`# Create metadata for the index.json file`
			`metadata = {`
			`"total_size": total_size,`
			`"weight_map": weight_map`
			`}`

			`# Save the metadata to index.json`
			`index_path = path_dst + '/index.json'`
			`with open(index_path, 'w') as f:`
			`json.dump(metadata, f, indent=4)`

			`print(f"Metadata has been saved to {index_path}")`

			`config = {`
			`"architectures": [`
			`"WavTokenizerDec"`
			`],`
			`"hidden_size": 1282,`
			`"n_embd_features": 512,`
			`"n_ff": 2304,`
			`"vocab_size": 4096,`
			`"n_head": 1,`
			`"layer_norm_epsilon": 1e-6,`
			`"group_norm_epsilon": 1e-6,`
			`"group_norm_groups": 32,`
			`"max_position_embeddings": 8192, # ?`
			`"n_layer": 12,`
			`"posnet": {`
			`"n_embd": 768,`
			`"n_layer": 6`
			`},`
			`"convnext": {`
			`"n_embd": 768,`
			`"n_layer": 12`
			`},`
			`}`

			`with open(path_dst + '/config.json', 'w') as f:`
			`json.dump(config, f, indent=4)`

			`print(f"Config has been saved to {path_dst + 'config.json'}")`