diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 28a43c54f..a42458e63 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -2393,6 +2393,16 @@ class JambaModel(Model):
         return "gpt-2"
 
+    def set_vocab(self):
+        if (self.dir_model / "tokenizer.model").is_file():
+            # Using Jamba's tokenizer.json causes errors on model load
+            # (something about "byte not found in vocab"),
+            # but there's a working tokenizer.model
+            self._set_vocab_sentencepiece()
+        else:
+            # Some Jamba models only have a tokenizer.json, which works.
+            self._set_vocab_gpt2()
+
     def set_gguf_parameters(self):
         d_model = self.find_hparam(["hidden_size", "mamba_d_model"])
         d_conv = self.find_hparam(["mamba_d_conv"], optional=True) or 4
@@ -2412,7 +2422,7 @@ class JambaModel(Model):
 
         self.gguf_writer.add_name(self.dir_model.name)
         self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
+        self.gguf_writer.add_context_length(self.find_hparam(["max_position_embeddings", "n_ctx"]))
         self.gguf_writer.add_embedding_length(d_model)
         self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
         self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
@@ -2430,6 +2440,15 @@ class JambaModel(Model):
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Mini-Jamba
+        name = name.replace(".moe.", ".feed_forward.")
+        if bid is not None:
+            moe_offset = self.hparams["expert_layer_offset"]
+            moe_period = self.hparams["expert_layer_period"]
+
+            if not (bid >= moe_offset and (bid - moe_offset) % moe_period == 0):
+                name = name.replace(".experts.0.", ".")
+
         # process the experts separately
         if ".feed_forward.experts." in name:
             n_experts = self.hparams["num_experts"]
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index b71bf1ecd..c81600151 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -207,6 +207,7 @@ class TensorNameMap:
             "model.layers.{bid}.ffn_norm",                 # internlm2
             "transformer.decoder_layer.{bid}.rms_norm_2",  # Grok
             "model.layers.{bid}.pre_ff_layernorm",         # jamba
+            "model.layers.{bid}.pre_moe_layernorm",        # mini-jamba
         ),
 
         MODEL_TENSOR.FFN_GATE_INP: (
@@ -390,10 +391,12 @@ class TensorNameMap:
 
         MODEL_TENSOR.SSM_B_NORM: (
             "model.layers.{bid}.mamba.b_layernorm",  # jamba
+            "model.layers.{bid}.mamba.B_layernorm",  # mini-jamba
         ),
 
         MODEL_TENSOR.SSM_C_NORM: (
            "model.layers.{bid}.mamba.c_layernorm",  # jamba
+           "model.layers.{bid}.mamba.C_layernorm",  # mini-jamba
         ),
 
         MODEL_TENSOR.SSM_D: (
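
For reference, the bid check added to modify_tensors() decides which blocks keep their MoE expert tensors; every other block has its single ".experts.0." tensors renamed to plain feed-forward tensors. Below is a minimal standalone sketch of that predicate, with hypothetical expert_layer_offset/expert_layer_period values rather than ones read from a real Jamba config.json:

def is_moe_block(bid: int, expert_layer_offset: int, expert_layer_period: int) -> bool:
    # Mirrors the condition in the patch: a block is a MoE block when it sits
    # at the offset or a whole number of periods past it.
    return bid >= expert_layer_offset and (bid - expert_layer_offset) % expert_layer_period == 0

# With offset=1 and period=2 (hypothetical values), blocks 1, 3, 5, 7 are MoE blocks:
print([bid for bid in range(8) if is_moe_block(bid, 1, 2)])  # -> [1, 3, 5, 7]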