tts : outetts-voc -> wavtokenizer-dec
Some checks failed
Python check requirements.txt / check-requirements (push) Has been cancelled
Python Type-Check / pyright type-check (push) Has been cancelled

This commit is contained in:
Georgi Gerganov 2024-12-16 13:51:09 +02:00
parent f1b5b6b5a1
commit 985d59f5e5
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
5 changed files with 198 additions and 198 deletions

View File

@ -2032,9 +2032,9 @@ class Qwen2VLModel(Model):
yield name, data yield name, data
@Model.register("OuteTTSVocoder") @Model.register("WavTokenizerDec")
class OuteTTSVocoderModel(Model): class WavTokenizerDecModel(Model):
model_arch = gguf.MODEL_ARCH.OUTETTS_VOC model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused del bid # unused

View File

@ -1,5 +1,5 @@
# convert the https://huggingface.co/novateur/WavTokenizer-large-speech-75token to HF format # convert the https://huggingface.co/novateur/WavTokenizer-large-speech-75token to HF format
# the goal is to be able to reuse the convert_hf_to_gguf.py after that to create a GGUF file with the OuteTTSS vocoder # the goal is to be able to reuse the convert_hf_to_gguf.py after that to create a GGUF file with the WavTokenizer decoder
# #
# TODO: this script is LLM-generated and probably very inefficient and should be rewritten # TODO: this script is LLM-generated and probably very inefficient and should be rewritten
@ -144,7 +144,7 @@ print(f"Metadata has been saved to {index_path}")
config = { config = {
"architectures": [ "architectures": [
"OuteTTSVocoder" "WavTokenizerDec"
], ],
"hidden_size": 1282, "hidden_size": 1282,
"vocab_size": 4096, "vocab_size": 4096,

View File

@ -209,59 +209,59 @@ class GGUFType:
class MODEL_ARCH(IntEnum): class MODEL_ARCH(IntEnum):
LLAMA = auto() LLAMA = auto()
FALCON = auto() FALCON = auto()
BAICHUAN = auto() BAICHUAN = auto()
GROK = auto() GROK = auto()
GPT2 = auto() GPT2 = auto()
GPTJ = auto() GPTJ = auto()
GPTNEOX = auto() GPTNEOX = auto()
MPT = auto() MPT = auto()
STARCODER = auto() STARCODER = auto()
REFACT = auto() REFACT = auto()
BERT = auto() BERT = auto()
NOMIC_BERT = auto() NOMIC_BERT = auto()
JINA_BERT_V2 = auto() JINA_BERT_V2 = auto()
BLOOM = auto() BLOOM = auto()
STABLELM = auto() STABLELM = auto()
QWEN = auto() QWEN = auto()
QWEN2 = auto() QWEN2 = auto()
QWEN2MOE = auto() QWEN2MOE = auto()
QWEN2VL = auto() QWEN2VL = auto()
PHI2 = auto() PHI2 = auto()
PHI3 = auto() PHI3 = auto()
PLAMO = auto() PLAMO = auto()
CODESHELL = auto() CODESHELL = auto()
ORION = auto() ORION = auto()
INTERNLM2 = auto() INTERNLM2 = auto()
MINICPM = auto() MINICPM = auto()
MINICPM3 = auto() MINICPM3 = auto()
GEMMA = auto() GEMMA = auto()
GEMMA2 = auto() GEMMA2 = auto()
STARCODER2 = auto() STARCODER2 = auto()
RWKV6 = auto() RWKV6 = auto()
MAMBA = auto() MAMBA = auto()
XVERSE = auto() XVERSE = auto()
COMMAND_R = auto() COMMAND_R = auto()
DBRX = auto() DBRX = auto()
OLMO = auto() OLMO = auto()
OLMO2 = auto() OLMO2 = auto()
OLMOE = auto() OLMOE = auto()
OPENELM = auto() OPENELM = auto()
ARCTIC = auto() ARCTIC = auto()
DEEPSEEK = auto() DEEPSEEK = auto()
DEEPSEEK2 = auto() DEEPSEEK2 = auto()
CHATGLM = auto() CHATGLM = auto()
BITNET = auto() BITNET = auto()
T5 = auto() T5 = auto()
T5ENCODER = auto() T5ENCODER = auto()
JAIS = auto() JAIS = auto()
NEMOTRON = auto() NEMOTRON = auto()
EXAONE = auto() EXAONE = auto()
GRANITE = auto() GRANITE = auto()
GRANITE_MOE = auto() GRANITE_MOE = auto()
CHAMELEON = auto() CHAMELEON = auto()
OUTETTS_VOC = auto() WAVTOKENIZER_DEC = auto()
class MODEL_TENSOR(IntEnum): class MODEL_TENSOR(IntEnum):
@ -390,59 +390,59 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.LLAMA: "llama", MODEL_ARCH.LLAMA: "llama",
MODEL_ARCH.FALCON: "falcon", MODEL_ARCH.FALCON: "falcon",
MODEL_ARCH.BAICHUAN: "baichuan", MODEL_ARCH.BAICHUAN: "baichuan",
MODEL_ARCH.GROK: "grok", MODEL_ARCH.GROK: "grok",
MODEL_ARCH.GPT2: "gpt2", MODEL_ARCH.GPT2: "gpt2",
MODEL_ARCH.GPTJ: "gptj", MODEL_ARCH.GPTJ: "gptj",
MODEL_ARCH.GPTNEOX: "gptneox", MODEL_ARCH.GPTNEOX: "gptneox",
MODEL_ARCH.MPT: "mpt", MODEL_ARCH.MPT: "mpt",
MODEL_ARCH.STARCODER: "starcoder", MODEL_ARCH.STARCODER: "starcoder",
MODEL_ARCH.REFACT: "refact", MODEL_ARCH.REFACT: "refact",
MODEL_ARCH.BERT: "bert", MODEL_ARCH.BERT: "bert",
MODEL_ARCH.NOMIC_BERT: "nomic-bert", MODEL_ARCH.NOMIC_BERT: "nomic-bert",
MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2", MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
MODEL_ARCH.BLOOM: "bloom", MODEL_ARCH.BLOOM: "bloom",
MODEL_ARCH.STABLELM: "stablelm", MODEL_ARCH.STABLELM: "stablelm",
MODEL_ARCH.QWEN: "qwen", MODEL_ARCH.QWEN: "qwen",
MODEL_ARCH.QWEN2: "qwen2", MODEL_ARCH.QWEN2: "qwen2",
MODEL_ARCH.QWEN2MOE: "qwen2moe", MODEL_ARCH.QWEN2MOE: "qwen2moe",
MODEL_ARCH.QWEN2VL: "qwen2vl", MODEL_ARCH.QWEN2VL: "qwen2vl",
MODEL_ARCH.PHI2: "phi2", MODEL_ARCH.PHI2: "phi2",
MODEL_ARCH.PHI3: "phi3", MODEL_ARCH.PHI3: "phi3",
MODEL_ARCH.PLAMO: "plamo", MODEL_ARCH.PLAMO: "plamo",
MODEL_ARCH.CODESHELL: "codeshell", MODEL_ARCH.CODESHELL: "codeshell",
MODEL_ARCH.ORION: "orion", MODEL_ARCH.ORION: "orion",
MODEL_ARCH.INTERNLM2: "internlm2", MODEL_ARCH.INTERNLM2: "internlm2",
MODEL_ARCH.MINICPM: "minicpm", MODEL_ARCH.MINICPM: "minicpm",
MODEL_ARCH.MINICPM3: "minicpm3", MODEL_ARCH.MINICPM3: "minicpm3",
MODEL_ARCH.GEMMA: "gemma", MODEL_ARCH.GEMMA: "gemma",
MODEL_ARCH.GEMMA2: "gemma2", MODEL_ARCH.GEMMA2: "gemma2",
MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.STARCODER2: "starcoder2",
MODEL_ARCH.RWKV6: "rwkv6", MODEL_ARCH.RWKV6: "rwkv6",
MODEL_ARCH.MAMBA: "mamba", MODEL_ARCH.MAMBA: "mamba",
MODEL_ARCH.XVERSE: "xverse", MODEL_ARCH.XVERSE: "xverse",
MODEL_ARCH.COMMAND_R: "command-r", MODEL_ARCH.COMMAND_R: "command-r",
MODEL_ARCH.DBRX: "dbrx", MODEL_ARCH.DBRX: "dbrx",
MODEL_ARCH.OLMO: "olmo", MODEL_ARCH.OLMO: "olmo",
MODEL_ARCH.OLMO2: "olmo2", MODEL_ARCH.OLMO2: "olmo2",
MODEL_ARCH.OLMOE: "olmoe", MODEL_ARCH.OLMOE: "olmoe",
MODEL_ARCH.OPENELM: "openelm", MODEL_ARCH.OPENELM: "openelm",
MODEL_ARCH.ARCTIC: "arctic", MODEL_ARCH.ARCTIC: "arctic",
MODEL_ARCH.DEEPSEEK: "deepseek", MODEL_ARCH.DEEPSEEK: "deepseek",
MODEL_ARCH.DEEPSEEK2: "deepseek2", MODEL_ARCH.DEEPSEEK2: "deepseek2",
MODEL_ARCH.CHATGLM: "chatglm", MODEL_ARCH.CHATGLM: "chatglm",
MODEL_ARCH.BITNET: "bitnet", MODEL_ARCH.BITNET: "bitnet",
MODEL_ARCH.T5: "t5", MODEL_ARCH.T5: "t5",
MODEL_ARCH.T5ENCODER: "t5encoder", MODEL_ARCH.T5ENCODER: "t5encoder",
MODEL_ARCH.JAIS: "jais", MODEL_ARCH.JAIS: "jais",
MODEL_ARCH.NEMOTRON: "nemotron", MODEL_ARCH.NEMOTRON: "nemotron",
MODEL_ARCH.EXAONE: "exaone", MODEL_ARCH.EXAONE: "exaone",
MODEL_ARCH.GRANITE: "granite", MODEL_ARCH.GRANITE: "granite",
MODEL_ARCH.GRANITE_MOE: "granitemoe", MODEL_ARCH.GRANITE_MOE: "granitemoe",
MODEL_ARCH.CHAMELEON: "chameleon", MODEL_ARCH.CHAMELEON: "chameleon",
MODEL_ARCH.OUTETTS_VOC: "outetts-voc", MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
} }
TENSOR_NAMES: dict[MODEL_TENSOR, str] = { TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@ -1406,7 +1406,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
], ],
MODEL_ARCH.OUTETTS_VOC: [ MODEL_ARCH.WAVTOKENIZER_DEC: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.TOKEN_EMBD_NORM, MODEL_TENSOR.TOKEN_EMBD_NORM,
MODEL_TENSOR.CONV1D, MODEL_TENSOR.CONV1D,

View File

@ -42,7 +42,7 @@ class TensorNameMap:
"emb_ln", # nomic-bert "emb_ln", # nomic-bert
"transformer.norm", # openelm "transformer.norm", # openelm
"rwkv.blocks.0.pre_ln", # rwkv "rwkv.blocks.0.pre_ln", # rwkv
"backbone.norm", # outetts "backbone.norm", # wavtokenizer
), ),
# Position embeddings # Position embeddings
@ -61,7 +61,7 @@ class TensorNameMap:
"lm_head.linear", # phi2 "lm_head.linear", # phi2
"output_layer", # chatglm "output_layer", # chatglm
"head", # rwkv "head", # rwkv
"head.out", # outetts "head.out", # wavtokenizer
), ),
# Output norm # Output norm
@ -82,7 +82,7 @@ class TensorNameMap:
"transformer.norm", # openelm "transformer.norm", # openelm
"model.norm", # nemotron "model.norm", # nemotron
"rwkv.ln_out", # rwkv "rwkv.ln_out", # rwkv
"backbone.final_layer_norm", # outetts "backbone.final_layer_norm", # wavtokenizer
), ),
# Rope frequencies # Rope frequencies
@ -705,63 +705,63 @@ class TensorNameMap:
############################################################################# #############################################################################
MODEL_TENSOR.CONV_NEXT_DW: ( MODEL_TENSOR.CONV_NEXT_DW: (
"backbone.convnext.{bid}.dwconv", # outetts "backbone.convnext.{bid}.dwconv", # wavtokenizer
), ),
MODEL_TENSOR.CONV_NEXT_NORM: ( MODEL_TENSOR.CONV_NEXT_NORM: (
"backbone.convnext.{bid}.norm", # outetts "backbone.convnext.{bid}.norm", # wavtokenizer
), ),
MODEL_TENSOR.CONV_NEXT_PW1: ( MODEL_TENSOR.CONV_NEXT_PW1: (
"backbone.convnext.{bid}.pwconv1", # outetts "backbone.convnext.{bid}.pwconv1", # wavtokenizer
), ),
MODEL_TENSOR.CONV_NEXT_PW2: ( MODEL_TENSOR.CONV_NEXT_PW2: (
"backbone.convnext.{bid}.pwconv2", # outetts "backbone.convnext.{bid}.pwconv2", # wavtokenizer
), ),
MODEL_TENSOR.CONV_NEXT_GAMMA: ( MODEL_TENSOR.CONV_NEXT_GAMMA: (
"backbone.convnext.{bid}.gamma", # outetts "backbone.convnext.{bid}.gamma", # wavtokenizer
), ),
MODEL_TENSOR.POS_NET_CONV1: ( MODEL_TENSOR.POS_NET_CONV1: (
"backbone.pos_net.{bid}.conv1", # outetts "backbone.pos_net.{bid}.conv1", # wavtokenizer
), ),
MODEL_TENSOR.POS_NET_CONV2: ( MODEL_TENSOR.POS_NET_CONV2: (
"backbone.pos_net.{bid}.conv2", # outetts "backbone.pos_net.{bid}.conv2", # wavtokenizer
), ),
MODEL_TENSOR.POS_NET_NORM: ( MODEL_TENSOR.POS_NET_NORM: (
"backbone.pos_net.{bid}.norm", # outetts "backbone.pos_net.{bid}.norm", # wavtokenizer
), ),
MODEL_TENSOR.POS_NET_NORM1: ( MODEL_TENSOR.POS_NET_NORM1: (
"backbone.pos_net.{bid}.norm1", # outetts "backbone.pos_net.{bid}.norm1", # wavtokenizer
), ),
MODEL_TENSOR.POS_NET_NORM2: ( MODEL_TENSOR.POS_NET_NORM2: (
"backbone.pos_net.{bid}.norm2", # outetts "backbone.pos_net.{bid}.norm2", # wavtokenizer
), ),
MODEL_TENSOR.POS_NET_ATTN_NORM: ( MODEL_TENSOR.POS_NET_ATTN_NORM: (
"backbone.pos_net.{bid}.norm", # outetts "backbone.pos_net.{bid}.norm", # wavtokenizer
), ),
MODEL_TENSOR.POS_NET_ATTN_Q: ( MODEL_TENSOR.POS_NET_ATTN_Q: (
"backbone.pos_net.{bid}.q", # outetts "backbone.pos_net.{bid}.q", # wavtokenizer
), ),
MODEL_TENSOR.POS_NET_ATTN_K: ( MODEL_TENSOR.POS_NET_ATTN_K: (
"backbone.pos_net.{bid}.k", # outetts "backbone.pos_net.{bid}.k", # wavtokenizer
), ),
MODEL_TENSOR.POS_NET_ATTN_V: ( MODEL_TENSOR.POS_NET_ATTN_V: (
"backbone.pos_net.{bid}.v", # outetts "backbone.pos_net.{bid}.v", # wavtokenizer
), ),
MODEL_TENSOR.POS_NET_ATTN_OUT: ( MODEL_TENSOR.POS_NET_ATTN_OUT: (
"backbone.pos_net.{bid}.proj_out", # outetts "backbone.pos_net.{bid}.proj_out", # wavtokenizer
), ),
} }

View File

@ -197,65 +197,65 @@ enum llm_arch {
LLM_ARCH_GRANITE, LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE, LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON, LLM_ARCH_CHAMELEON,
LLM_ARCH_OUTETTS_VOC, LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
}; };
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = { static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_LLAMA, "llama" }, { LLM_ARCH_LLAMA, "llama" },
{ LLM_ARCH_FALCON, "falcon" }, { LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GROK, "grok" }, { LLM_ARCH_GROK, "grok" },
{ LLM_ARCH_GPT2, "gpt2" }, { LLM_ARCH_GPT2, "gpt2" },
{ LLM_ARCH_GPTJ, "gptj" }, { LLM_ARCH_GPTJ, "gptj" },
{ LLM_ARCH_GPTNEOX, "gptneox" }, { LLM_ARCH_GPTNEOX, "gptneox" },
{ LLM_ARCH_MPT, "mpt" }, { LLM_ARCH_MPT, "mpt" },
{ LLM_ARCH_BAICHUAN, "baichuan" }, { LLM_ARCH_BAICHUAN, "baichuan" },
{ LLM_ARCH_STARCODER, "starcoder" }, { LLM_ARCH_STARCODER, "starcoder" },
{ LLM_ARCH_REFACT, "refact" }, { LLM_ARCH_REFACT, "refact" },
{ LLM_ARCH_BERT, "bert" }, { LLM_ARCH_BERT, "bert" },
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" }, { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
{ LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" }, { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
{ LLM_ARCH_BLOOM, "bloom" }, { LLM_ARCH_BLOOM, "bloom" },
{ LLM_ARCH_STABLELM, "stablelm" }, { LLM_ARCH_STABLELM, "stablelm" },
{ LLM_ARCH_QWEN, "qwen" }, { LLM_ARCH_QWEN, "qwen" },
{ LLM_ARCH_QWEN2, "qwen2" }, { LLM_ARCH_QWEN2, "qwen2" },
{ LLM_ARCH_QWEN2MOE, "qwen2moe" }, { LLM_ARCH_QWEN2MOE, "qwen2moe" },
{ LLM_ARCH_QWEN2VL, "qwen2vl" }, { LLM_ARCH_QWEN2VL, "qwen2vl" },
{ LLM_ARCH_PHI2, "phi2" }, { LLM_ARCH_PHI2, "phi2" },
{ LLM_ARCH_PHI3, "phi3" }, { LLM_ARCH_PHI3, "phi3" },
{ LLM_ARCH_PLAMO, "plamo" }, { LLM_ARCH_PLAMO, "plamo" },
{ LLM_ARCH_CODESHELL, "codeshell" }, { LLM_ARCH_CODESHELL, "codeshell" },
{ LLM_ARCH_ORION, "orion" }, { LLM_ARCH_ORION, "orion" },
{ LLM_ARCH_INTERNLM2, "internlm2" }, { LLM_ARCH_INTERNLM2, "internlm2" },
{ LLM_ARCH_MINICPM, "minicpm" }, { LLM_ARCH_MINICPM, "minicpm" },
{ LLM_ARCH_MINICPM3, "minicpm3" }, { LLM_ARCH_MINICPM3, "minicpm3" },
{ LLM_ARCH_GEMMA, "gemma" }, { LLM_ARCH_GEMMA, "gemma" },
{ LLM_ARCH_GEMMA2, "gemma2" }, { LLM_ARCH_GEMMA2, "gemma2" },
{ LLM_ARCH_STARCODER2, "starcoder2" }, { LLM_ARCH_STARCODER2, "starcoder2" },
{ LLM_ARCH_MAMBA, "mamba" }, { LLM_ARCH_MAMBA, "mamba" },
{ LLM_ARCH_XVERSE, "xverse" }, { LLM_ARCH_XVERSE, "xverse" },
{ LLM_ARCH_COMMAND_R, "command-r" }, { LLM_ARCH_COMMAND_R, "command-r" },
{ LLM_ARCH_DBRX, "dbrx" }, { LLM_ARCH_DBRX, "dbrx" },
{ LLM_ARCH_OLMO, "olmo" }, { LLM_ARCH_OLMO, "olmo" },
{ LLM_ARCH_OLMO2, "olmo2" }, { LLM_ARCH_OLMO2, "olmo2" },
{ LLM_ARCH_OLMOE, "olmoe" }, { LLM_ARCH_OLMOE, "olmoe" },
{ LLM_ARCH_OPENELM, "openelm" }, { LLM_ARCH_OPENELM, "openelm" },
{ LLM_ARCH_ARCTIC, "arctic" }, { LLM_ARCH_ARCTIC, "arctic" },
{ LLM_ARCH_DEEPSEEK, "deepseek" }, { LLM_ARCH_DEEPSEEK, "deepseek" },
{ LLM_ARCH_DEEPSEEK2, "deepseek2" }, { LLM_ARCH_DEEPSEEK2, "deepseek2" },
{ LLM_ARCH_CHATGLM, "chatglm" }, { LLM_ARCH_CHATGLM, "chatglm" },
{ LLM_ARCH_BITNET, "bitnet" }, { LLM_ARCH_BITNET, "bitnet" },
{ LLM_ARCH_T5, "t5" }, { LLM_ARCH_T5, "t5" },
{ LLM_ARCH_T5ENCODER, "t5encoder" }, { LLM_ARCH_T5ENCODER, "t5encoder" },
{ LLM_ARCH_JAIS, "jais" }, { LLM_ARCH_JAIS, "jais" },
{ LLM_ARCH_NEMOTRON, "nemotron" }, { LLM_ARCH_NEMOTRON, "nemotron" },
{ LLM_ARCH_EXAONE, "exaone" }, { LLM_ARCH_EXAONE, "exaone" },
{ LLM_ARCH_RWKV6, "rwkv6" }, { LLM_ARCH_RWKV6, "rwkv6" },
{ LLM_ARCH_GRANITE, "granite" }, { LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" }, { LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" }, { LLM_ARCH_CHAMELEON, "chameleon" },
{ LLM_ARCH_OUTETTS_VOC, "outetts-voc" }, { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_UNKNOWN, "(unknown)" }, { LLM_ARCH_UNKNOWN, "(unknown)" },
}; };
enum llm_kv { enum llm_kv {
@ -1612,7 +1612,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
}, },
}, },
{ {
LLM_ARCH_OUTETTS_VOC, LLM_ARCH_WAVTOKENIZER_DEC,
{ {
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
@ -3063,7 +3063,7 @@ struct llama_model {
struct ggml_tensor * cls_out = nullptr; struct ggml_tensor * cls_out = nullptr;
struct ggml_tensor * cls_out_b = nullptr; struct ggml_tensor * cls_out_b = nullptr;
// outetts vocoder // wavtokenizer decoder
// TODO: dedup // TODO: dedup
struct ggml_tensor * conv_1d = nullptr; struct ggml_tensor * conv_1d = nullptr;
struct ggml_tensor * conv_1d_b = nullptr; struct ggml_tensor * conv_1d_b = nullptr;
@ -6443,7 +6443,7 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
} break; } break;
case LLM_ARCH_OUTETTS_VOC: case LLM_ARCH_WAVTOKENIZER_DEC:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
} break; } break;
@ -9545,7 +9545,7 @@ static bool llm_load_tensors(
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
} }
} break; } break;
case LLM_ARCH_OUTETTS_VOC: case LLM_ARCH_WAVTOKENIZER_DEC:
{ {
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {512, n_vocab}, 0); model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {512, n_vocab}, 0);
@ -16142,7 +16142,7 @@ struct llm_build_context {
return gf; return gf;
} }
struct ggml_cgraph * build_t5_encoder() { struct ggml_cgraph * build_t5_enc() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens // mutable variable, needed during the last layer of the computation to skip unused tokens
@ -16274,7 +16274,7 @@ struct llm_build_context {
return gf; return gf;
} }
struct ggml_cgraph * build_t5_decoder() { struct ggml_cgraph * build_t5_dec() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
// mutable variable, needed during the last layer of the computation to skip unused tokens // mutable variable, needed during the last layer of the computation to skip unused tokens
@ -17224,7 +17224,7 @@ struct llm_build_context {
return gf; return gf;
} }
struct ggml_cgraph * build_outetts_voc() { struct ggml_cgraph * build_wavtokenizer_dec() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
struct ggml_tensor * cur; struct ggml_tensor * cur;
@ -17692,14 +17692,14 @@ static struct ggml_cgraph * llama_build_graph(
case LLM_ARCH_T5: case LLM_ARCH_T5:
{ {
if (lctx.is_encoding) { if (lctx.is_encoding) {
result = llm.build_t5_encoder(); result = llm.build_t5_enc();
} else { } else {
result = llm.build_t5_decoder(); result = llm.build_t5_dec();
} }
} break; } break;
case LLM_ARCH_T5ENCODER: case LLM_ARCH_T5ENCODER:
{ {
result = llm.build_t5_encoder(); result = llm.build_t5_enc();
} break; } break;
case LLM_ARCH_JAIS: case LLM_ARCH_JAIS:
{ {
@ -17721,9 +17721,9 @@ static struct ggml_cgraph * llama_build_graph(
{ {
result = llm.build_chameleon(); result = llm.build_chameleon();
} break; } break;
case LLM_ARCH_OUTETTS_VOC: case LLM_ARCH_WAVTOKENIZER_DEC:
{ {
result = llm.build_outetts_voc(); result = llm.build_wavtokenizer_dec();
} break; } break;
default: default:
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
@ -20904,7 +20904,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_T5ENCODER: case LLM_ARCH_T5ENCODER:
case LLM_ARCH_JAIS: case LLM_ARCH_JAIS:
case LLM_ARCH_RWKV6: case LLM_ARCH_RWKV6:
case LLM_ARCH_OUTETTS_VOC: case LLM_ARCH_WAVTOKENIZER_DEC:
return LLAMA_ROPE_TYPE_NONE; return LLAMA_ROPE_TYPE_NONE;
// use what we call a normal RoPE, operating on pairs of consecutive head values // use what we call a normal RoPE, operating on pairs of consecutive head values