py : fix StableLM conversion after config.json changes (#5703)

* Fix issues during StableLM models conversion

* Fix hard coded layer_norm_eps

* Support layer_norm_eps for LlavaStableLM

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

* Add missing parenthesis

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

* Support rotary_factor for LlavaStableLM

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

* fix typo

* Add StableLMEpochForCausalLM for safety

Co-authored-by: compilade <113953597+compilade@users.noreply.github.com>

* Add StableLMEpochForCausalLM for safety 2

Co-authored-by: compilade <113953597+compilade@users.noreply.github.com>

---------

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: compilade <113953597+compilade@users.noreply.github.com>
This commit is contained in:
Anas Ahouzi 2024-02-25 10:54:04 +01:00 committed by GitHub
parent 9e359a4f47
commit 69917dfa55
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -192,7 +192,7 @@ class Model:
return RefactModel return RefactModel
if model_architecture == "PersimmonForCausalLM": if model_architecture == "PersimmonForCausalLM":
return PersimmonModel return PersimmonModel
if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): if model_architecture in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
return StableLMModel return StableLMModel
if model_architecture == "QWenLMHeadModel": if model_architecture == "QWenLMHeadModel":
return QwenModel return QwenModel
@ -253,7 +253,7 @@ class Model:
return gguf.MODEL_ARCH.REFACT return gguf.MODEL_ARCH.REFACT
if arch == "PersimmonForCausalLM": if arch == "PersimmonForCausalLM":
return gguf.MODEL_ARCH.PERSIMMON return gguf.MODEL_ARCH.PERSIMMON
if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): if arch in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
return gguf.MODEL_ARCH.STABLELM return gguf.MODEL_ARCH.STABLELM
if arch == "QWenLMHeadModel": if arch == "QWenLMHeadModel":
return gguf.MODEL_ARCH.QWEN return gguf.MODEL_ARCH.QWEN
@ -1074,10 +1074,11 @@ class StableLMModel(Model):
self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"]))) rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
self.gguf_writer.add_head_count(hparams["num_attention_heads"]) self.gguf_writer.add_head_count(hparams["num_attention_heads"])
self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
self.gguf_writer.add_layer_norm_eps(1e-5) self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
class MixtralModel(Model): class MixtralModel(Model):