tts : outetts-voc -> wavtokenizer-dec
Some checks failed
Python check requirements.txt / check-requirements (push) Has been cancelled
Python Type-Check / pyright type-check (push) Has been cancelled

This commit is contained in:
Georgi Gerganov 2024-12-16 13:51:09 +02:00
parent f1b5b6b5a1
commit 985d59f5e5
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
5 changed files with 198 additions and 198 deletions

View File

@ -2032,9 +2032,9 @@ class Qwen2VLModel(Model):
yield name, data
@Model.register("OuteTTSVocoder")
class OuteTTSVocoderModel(Model):
model_arch = gguf.MODEL_ARCH.OUTETTS_VOC
@Model.register("WavTokenizerDec")
class WavTokenizerDecModel(Model):
model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused

View File

@ -1,5 +1,5 @@
# convert the https://huggingface.co/novateur/WavTokenizer-large-speech-75token model to HF format
# the goal is to be able to reuse the convert_hf_to_gguf.py after that to create a GGUF file with the OuteTTS vocoder
# the goal is to be able to reuse the convert_hf_to_gguf.py after that to create a GGUF file with the WavTokenizer decoder
#
# TODO: this script is LLM-generated and probably very inefficient and should be rewritten
@ -144,7 +144,7 @@ print(f"Metadata has been saved to {index_path}")
config = {
"architectures": [
"OuteTTSVocoder"
"WavTokenizerDec"
],
"hidden_size": 1282,
"vocab_size": 4096,

View File

@ -261,7 +261,7 @@ class MODEL_ARCH(IntEnum):
GRANITE = auto()
GRANITE_MOE = auto()
CHAMELEON = auto()
OUTETTS_VOC = auto()
WAVTOKENIZER_DEC = auto()
class MODEL_TENSOR(IntEnum):
@ -442,7 +442,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.GRANITE: "granite",
MODEL_ARCH.GRANITE_MOE: "granitemoe",
MODEL_ARCH.CHAMELEON: "chameleon",
MODEL_ARCH.OUTETTS_VOC: "outetts-voc",
MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
}
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@ -1406,7 +1406,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.OUTETTS_VOC: [
MODEL_ARCH.WAVTOKENIZER_DEC: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.TOKEN_EMBD_NORM,
MODEL_TENSOR.CONV1D,

View File

@ -42,7 +42,7 @@ class TensorNameMap:
"emb_ln", # nomic-bert
"transformer.norm", # openelm
"rwkv.blocks.0.pre_ln", # rwkv
"backbone.norm", # outetts
"backbone.norm", # wavtokenizer
),
# Position embeddings
@ -61,7 +61,7 @@ class TensorNameMap:
"lm_head.linear", # phi2
"output_layer", # chatglm
"head", # rwkv
"head.out", # outetts
"head.out", # wavtokenizer
),
# Output norm
@ -82,7 +82,7 @@ class TensorNameMap:
"transformer.norm", # openelm
"model.norm", # nemotron
"rwkv.ln_out", # rwkv
"backbone.final_layer_norm", # outetts
"backbone.final_layer_norm", # wavtokenizer
),
# Rope frequencies
@ -705,63 +705,63 @@ class TensorNameMap:
#############################################################################
MODEL_TENSOR.CONV_NEXT_DW: (
"backbone.convnext.{bid}.dwconv", # outetts
"backbone.convnext.{bid}.dwconv", # wavtokenizer
),
MODEL_TENSOR.CONV_NEXT_NORM: (
"backbone.convnext.{bid}.norm", # outetts
"backbone.convnext.{bid}.norm", # wavtokenizer
),
MODEL_TENSOR.CONV_NEXT_PW1: (
"backbone.convnext.{bid}.pwconv1", # outetts
"backbone.convnext.{bid}.pwconv1", # wavtokenizer
),
MODEL_TENSOR.CONV_NEXT_PW2: (
"backbone.convnext.{bid}.pwconv2", # outetts
"backbone.convnext.{bid}.pwconv2", # wavtokenizer
),
MODEL_TENSOR.CONV_NEXT_GAMMA: (
"backbone.convnext.{bid}.gamma", # outetts
"backbone.convnext.{bid}.gamma", # wavtokenizer
),
MODEL_TENSOR.POS_NET_CONV1: (
"backbone.pos_net.{bid}.conv1", # outetts
"backbone.pos_net.{bid}.conv1", # wavtokenizer
),
MODEL_TENSOR.POS_NET_CONV2: (
"backbone.pos_net.{bid}.conv2", # outetts
"backbone.pos_net.{bid}.conv2", # wavtokenizer
),
MODEL_TENSOR.POS_NET_NORM: (
"backbone.pos_net.{bid}.norm", # outetts
"backbone.pos_net.{bid}.norm", # wavtokenizer
),
MODEL_TENSOR.POS_NET_NORM1: (
"backbone.pos_net.{bid}.norm1", # outetts
"backbone.pos_net.{bid}.norm1", # wavtokenizer
),
MODEL_TENSOR.POS_NET_NORM2: (
"backbone.pos_net.{bid}.norm2", # outetts
"backbone.pos_net.{bid}.norm2", # wavtokenizer
),
MODEL_TENSOR.POS_NET_ATTN_NORM: (
"backbone.pos_net.{bid}.norm", # outetts
"backbone.pos_net.{bid}.norm", # wavtokenizer
),
MODEL_TENSOR.POS_NET_ATTN_Q: (
"backbone.pos_net.{bid}.q", # outetts
"backbone.pos_net.{bid}.q", # wavtokenizer
),
MODEL_TENSOR.POS_NET_ATTN_K: (
"backbone.pos_net.{bid}.k", # outetts
"backbone.pos_net.{bid}.k", # wavtokenizer
),
MODEL_TENSOR.POS_NET_ATTN_V: (
"backbone.pos_net.{bid}.v", # outetts
"backbone.pos_net.{bid}.v", # wavtokenizer
),
MODEL_TENSOR.POS_NET_ATTN_OUT: (
"backbone.pos_net.{bid}.proj_out", # outetts
"backbone.pos_net.{bid}.proj_out", # wavtokenizer
),
}

View File

@ -197,7 +197,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
LLM_ARCH_OUTETTS_VOC,
LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_UNKNOWN,
};
@ -254,7 +254,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
{ LLM_ARCH_OUTETTS_VOC, "outetts-voc" },
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};
@ -1612,7 +1612,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
},
},
{
LLM_ARCH_OUTETTS_VOC,
LLM_ARCH_WAVTOKENIZER_DEC,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
@ -3063,7 +3063,7 @@ struct llama_model {
struct ggml_tensor * cls_out = nullptr;
struct ggml_tensor * cls_out_b = nullptr;
// outetts vocoder
// wavtokenizer decoder
// TODO: dedup
struct ggml_tensor * conv_1d = nullptr;
struct ggml_tensor * conv_1d_b = nullptr;
@ -6443,7 +6443,7 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
case LLM_ARCH_OUTETTS_VOC:
case LLM_ARCH_WAVTOKENIZER_DEC:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
} break;
@ -9545,7 +9545,7 @@ static bool llm_load_tensors(
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
}
} break;
case LLM_ARCH_OUTETTS_VOC:
case LLM_ARCH_WAVTOKENIZER_DEC:
{
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {512, n_vocab}, 0);
@ -16142,7 +16142,7 @@ struct llm_build_context {
return gf;
}
struct ggml_cgraph * build_t5_encoder() {
struct ggml_cgraph * build_t5_enc() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
@ -16274,7 +16274,7 @@ struct llm_build_context {
return gf;
}
struct ggml_cgraph * build_t5_decoder() {
struct ggml_cgraph * build_t5_dec() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens
@ -17224,7 +17224,7 @@ struct llm_build_context {
return gf;
}
struct ggml_cgraph * build_outetts_voc() {
struct ggml_cgraph * build_wavtokenizer_dec() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
struct ggml_tensor * cur;
@ -17692,14 +17692,14 @@ static struct ggml_cgraph * llama_build_graph(
case LLM_ARCH_T5:
{
if (lctx.is_encoding) {
result = llm.build_t5_encoder();
result = llm.build_t5_enc();
} else {
result = llm.build_t5_decoder();
result = llm.build_t5_dec();
}
} break;
case LLM_ARCH_T5ENCODER:
{
result = llm.build_t5_encoder();
result = llm.build_t5_enc();
} break;
case LLM_ARCH_JAIS:
{
@ -17721,9 +17721,9 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_chameleon();
} break;
case LLM_ARCH_OUTETTS_VOC:
case LLM_ARCH_WAVTOKENIZER_DEC:
{
result = llm.build_outetts_voc();
result = llm.build_wavtokenizer_dec();
} break;
default:
GGML_ABORT("fatal error");
@ -20904,7 +20904,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_T5ENCODER:
case LLM_ARCH_JAIS:
case LLM_ARCH_RWKV6:
case LLM_ARCH_OUTETTS_VOC:
case LLM_ARCH_WAVTOKENIZER_DEC:
return LLAMA_ROPE_TYPE_NONE;
// use what we call a normal RoPE, operating on pairs of consecutive head values