diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 0ac64574a..a5bdd5def 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2843,6 +2843,10 @@ class Mamba2Model(Model): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused + if name.startswith("model.backbone") or name.startswith("model.lm_head"): + # map Mamba-Codestral-7B-v0.1 tensor names to the names used by Mamba-2 + name = name.removeprefix("model.") + if name.endswith(".dt_bias"): name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias" diff --git a/src/llama.cpp b/src/llama.cpp index 5be0ef7a2..fd80361bd 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9383,7 +9383,7 @@ static struct ggml_tensor * llm_build_mamba2( // grouped RMS norm y = ggml_reshape_4d(ctx, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs); y = llm_build_norm(ctx, y, hparams, - model.layers[il].ssm_norm, NULL, + ggml_reshape_2d(ctx, model.layers[il].ssm_norm, d_inner / n_group, n_group), NULL, LLM_NORM_RMS, cb, il); y = ggml_reshape_3d(ctx, y, d_inner, n_seq_tokens, n_seqs);