mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-31 22:04:35 +00:00
convert-hf : support Mini-Jamba conversion
This commit is contained in:
parent
ea2e63e9d2
commit
fc59407efe
@ -2393,6 +2393,16 @@ class JambaModel(Model):
|
|||||||
|
|
||||||
return "gpt-2"
|
return "gpt-2"
|
||||||
|
|
||||||
|
def set_vocab(self):
|
||||||
|
if (self.dir_model / "tokenizer.model").is_file():
|
||||||
|
# Using Jamba's tokenizer.json causes errors on model load
|
||||||
|
# (something about "byte not found in vocab"),
|
||||||
|
# but there's a working tokenizer.model
|
||||||
|
self._set_vocab_sentencepiece()
|
||||||
|
else:
|
||||||
|
# Some Jamba models only have a tokenizer.json, which works.
|
||||||
|
self._set_vocab_gpt2()
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
d_model = self.find_hparam(["hidden_size", "mamba_d_model"])
|
d_model = self.find_hparam(["hidden_size", "mamba_d_model"])
|
||||||
d_conv = self.find_hparam(["mamba_d_conv"], optional=True) or 4
|
d_conv = self.find_hparam(["mamba_d_conv"], optional=True) or 4
|
||||||
@ -2412,7 +2422,7 @@ class JambaModel(Model):
|
|||||||
|
|
||||||
self.gguf_writer.add_name(self.dir_model.name)
|
self.gguf_writer.add_name(self.dir_model.name)
|
||||||
self.gguf_writer.add_block_count(self.block_count)
|
self.gguf_writer.add_block_count(self.block_count)
|
||||||
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
self.gguf_writer.add_context_length(self.find_hparam(["max_position_embeddings", "n_ctx"]))
|
||||||
self.gguf_writer.add_embedding_length(d_model)
|
self.gguf_writer.add_embedding_length(d_model)
|
||||||
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
||||||
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
||||||
@ -2430,6 +2440,15 @@ class JambaModel(Model):
|
|||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
|
|
||||||
|
# Mini-Jamba
|
||||||
|
name = name.replace(".moe.", ".feed_forward.")
|
||||||
|
if bid is not None:
|
||||||
|
moe_offset = self.hparams["expert_layer_offset"]
|
||||||
|
moe_period = self.hparams["expert_layer_period"]
|
||||||
|
|
||||||
|
if not (bid >= moe_offset and (bid - moe_offset) % moe_period == 0):
|
||||||
|
name = name.replace(".experts.0.", ".")
|
||||||
|
|
||||||
# process the experts separately
|
# process the experts separately
|
||||||
if ".feed_forward.experts." in name:
|
if ".feed_forward.experts." in name:
|
||||||
n_experts = self.hparams["num_experts"]
|
n_experts = self.hparams["num_experts"]
|
||||||
|
@ -207,6 +207,7 @@ class TensorNameMap:
|
|||||||
"model.layers.{bid}.ffn_norm", # internlm2
|
"model.layers.{bid}.ffn_norm", # internlm2
|
||||||
"transformer.decoder_layer.{bid}.rms_norm_2", # Grok
|
"transformer.decoder_layer.{bid}.rms_norm_2", # Grok
|
||||||
"model.layers.{bid}.pre_ff_layernorm", # jamba
|
"model.layers.{bid}.pre_ff_layernorm", # jamba
|
||||||
|
"model.layers.{bid}.pre_moe_layernorm", # mini-jamba
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_GATE_INP: (
|
MODEL_TENSOR.FFN_GATE_INP: (
|
||||||
@ -390,10 +391,12 @@ class TensorNameMap:
|
|||||||
|
|
||||||
MODEL_TENSOR.SSM_B_NORM: (
|
MODEL_TENSOR.SSM_B_NORM: (
|
||||||
"model.layers.{bid}.mamba.b_layernorm", # jamba
|
"model.layers.{bid}.mamba.b_layernorm", # jamba
|
||||||
|
"model.layers.{bid}.mamba.B_layernorm", # mini-jamba
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.SSM_C_NORM: (
|
MODEL_TENSOR.SSM_C_NORM: (
|
||||||
"model.layers.{bid}.mamba.c_layernorm", # jamba
|
"model.layers.{bid}.mamba.c_layernorm", # jamba
|
||||||
|
"model.layers.{bid}.mamba.C_layernorm", # mini-jamba
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.SSM_D: (
|
MODEL_TENSOR.SSM_D: (
|
||||||
|
Loading…
Reference in New Issue
Block a user