From 69917dfa55674c608360638bb4d6a12a315e2810 Mon Sep 17 00:00:00 2001
From: Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
Date: Sun, 25 Feb 2024 10:54:04 +0100
Subject: [PATCH] py : fix StableLM conversion after config.json changes
 (#5703)

* Fix issues during StableLM models conversion

* Fix hard coded layer_norm_eps

* Support layer_norm_eps for LlavaStableLM

Co-authored-by: Jared Van Bortel

* Add missing parenthesis

Co-authored-by: Jared Van Bortel

* Support rotary_factor for LlavaStableLM

Co-authored-by: Jared Van Bortel

* fix typo

* Add StableLMEpochForCausalLM for safety

Co-authored-by: compilade <113953597+compilade@users.noreply.github.com>

* Add StableLMEpochForCausalLM for safety 2

Co-authored-by: compilade <113953597+compilade@users.noreply.github.com>

---------

Co-authored-by: Jared Van Bortel
Co-authored-by: Jared Van Bortel
Co-authored-by: compilade <113953597+compilade@users.noreply.github.com>
---
 convert-hf-to-gguf.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 32d54b45f..ae30b2a76 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -192,7 +192,7 @@ class Model:
             return RefactModel
         if model_architecture == "PersimmonForCausalLM":
             return PersimmonModel
-        if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
+        if model_architecture in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
             return StableLMModel
         if model_architecture == "QWenLMHeadModel":
             return QwenModel
@@ -253,7 +253,7 @@ class Model:
             return gguf.MODEL_ARCH.REFACT
         if arch == "PersimmonForCausalLM":
             return gguf.MODEL_ARCH.PERSIMMON
-        if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
+        if arch in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
             return gguf.MODEL_ARCH.STABLELM
         if arch == "QWenLMHeadModel":
             return gguf.MODEL_ARCH.QWEN
@@ -1074,10 +1074,11 @@ class StableLMModel(Model):
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
         self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-        self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"])))
+        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
+        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
         self.gguf_writer.add_head_count(hparams["num_attention_heads"])
         self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
-        self.gguf_writer.add_layer_norm_eps(1e-5)
+        self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))


 class MixtralModel(Model):
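
Note: the fix works because both the new and the old StableLM config.json
keys are tried in order ("partial_rotary_factor" before "rope_pct",
"layer_norm_eps" before "norm_eps"), so the converter handles models
written against either config schema. Below is a minimal standalone sketch
of that fallback lookup, assuming find_hparam returns the value of the
first listed key present in the loaded config and fails if none match; the
free function and the sample configs are hypothetical stand-ins for the
Model method and real model files.

    from typing import Any, Sequence

    def find_hparam(hparams: dict[str, Any], keys: Sequence[str]) -> Any:
        # Return the value of the first key present in hparams.
        for key in keys:
            if key in hparams:
                return hparams[key]
        raise KeyError(f"could not find any of: {keys}")

    # Newer configs use "partial_rotary_factor"; older ones use "rope_pct".
    new_cfg = {"partial_rotary_factor": 0.25, "hidden_size": 2560, "num_attention_heads": 32}
    old_cfg = {"rope_pct": 0.25, "hidden_size": 2560, "num_attention_heads": 32}

    for cfg in (new_cfg, old_cfg):
        rotary_factor = find_hparam(cfg, ["partial_rotary_factor", "rope_pct"])
        rope_dims = int(rotary_factor * (cfg["hidden_size"] // cfg["num_attention_heads"]))
        print(rope_dims)  # 20 for both: 0.25 * (2560 // 32)

Either config shape yields the same rope dimension count, which is why the
patch can drop the hard-coded key (and the hard-coded 1e-5 epsilon) without
breaking conversion of older StableLM checkpoints.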