mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-11 19:21:46 +00:00
llama : add support for Nomic Embed (#5468)
This commit is contained in:
parent
c4e6dd59e4
commit
ea9c8e1143
@ -10,7 +10,7 @@ import re
|
|||||||
import sys
|
import sys
|
||||||
from enum import IntEnum
|
from enum import IntEnum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast
|
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, Sequence, cast
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
@ -25,15 +25,6 @@ import gguf
|
|||||||
from convert import HfVocab
|
from convert import HfVocab
|
||||||
|
|
||||||
|
|
||||||
# check for any of the given keys in the dictionary and return the value of the first key found
|
|
||||||
def get_key_opts(d, keys):
|
|
||||||
for k in keys:
|
|
||||||
if k in d:
|
|
||||||
return d[k]
|
|
||||||
print(f"Could not find any of {keys}")
|
|
||||||
sys.exit()
|
|
||||||
|
|
||||||
|
|
||||||
###### MODEL DEFINITIONS ######
|
###### MODEL DEFINITIONS ######
|
||||||
|
|
||||||
class SentencePieceTokenTypes(IntEnum):
|
class SentencePieceTokenTypes(IntEnum):
|
||||||
@ -58,6 +49,15 @@ class Model:
|
|||||||
self.hparams = Model.load_hparams(self.dir_model)
|
self.hparams = Model.load_hparams(self.dir_model)
|
||||||
self.model_arch = self._get_model_architecture()
|
self.model_arch = self._get_model_architecture()
|
||||||
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
|
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
|
||||||
|
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
|
||||||
|
|
||||||
|
def find_hparam(self, keys: Sequence[str], optional: bool = False) -> Any:
|
||||||
|
key = next((k for k in keys if k in self.hparams), None)
|
||||||
|
if key is not None:
|
||||||
|
return self.hparams[key]
|
||||||
|
if optional:
|
||||||
|
return None
|
||||||
|
raise KeyError(f"could not find any of: {keys}")
|
||||||
|
|
||||||
def set_vocab(self):
|
def set_vocab(self):
|
||||||
self._set_vocab_gpt2()
|
self._set_vocab_gpt2()
|
||||||
@ -79,28 +79,33 @@ class Model:
|
|||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
self.gguf_writer.add_name(self.dir_model.name)
|
self.gguf_writer.add_name(self.dir_model.name)
|
||||||
self.gguf_writer.add_block_count(self.hparams.get(
|
self.gguf_writer.add_block_count(self.block_count)
|
||||||
"n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")),
|
|
||||||
))
|
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
|
||||||
if (n_ctx := self.hparams.get("max_position_embeddings")) is not None:
|
|
||||||
self.gguf_writer.add_context_length(n_ctx)
|
self.gguf_writer.add_context_length(n_ctx)
|
||||||
if (n_embd := self.hparams.get("hidden_size")) is not None:
|
|
||||||
self.gguf_writer.add_embedding_length(n_embd)
|
n_embd = self.find_hparam(["hidden_size", "n_embd"])
|
||||||
if (n_ff := self.hparams.get("intermediate_size")) is not None:
|
self.gguf_writer.add_embedding_length(n_embd)
|
||||||
|
|
||||||
|
if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
|
||||||
self.gguf_writer.add_feed_forward_length(n_ff)
|
self.gguf_writer.add_feed_forward_length(n_ff)
|
||||||
if (n_head := self.hparams.get("num_attention_heads")) is not None:
|
|
||||||
self.gguf_writer.add_head_count(n_head)
|
n_head = self.find_hparam(["num_attention_heads", "n_head"])
|
||||||
|
self.gguf_writer.add_head_count(n_head)
|
||||||
|
|
||||||
if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
|
if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
|
||||||
self.gguf_writer.add_head_count_kv(n_head_kv)
|
self.gguf_writer.add_head_count_kv(n_head_kv)
|
||||||
|
|
||||||
if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
|
if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
|
||||||
self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps)
|
self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
|
||||||
|
if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon"], optional=True)) is not None:
|
||||||
|
self.gguf_writer.add_layer_norm_eps(f_norm_eps)
|
||||||
if (n_experts := self.hparams.get("num_local_experts")) is not None:
|
if (n_experts := self.hparams.get("num_local_experts")) is not None:
|
||||||
self.gguf_writer.add_expert_count(n_experts)
|
self.gguf_writer.add_expert_count(n_experts)
|
||||||
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
|
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
|
||||||
self.gguf_writer.add_expert_used_count(n_experts_used)
|
self.gguf_writer.add_expert_used_count(n_experts_used)
|
||||||
|
|
||||||
self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
|
||||||
def write_tensors(self):
|
def write_tensors(self):
|
||||||
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
|
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
|
||||||
@ -211,6 +216,8 @@ class Model:
|
|||||||
return MiniCPMModel
|
return MiniCPMModel
|
||||||
if model_architecture == "BertModel":
|
if model_architecture == "BertModel":
|
||||||
return BertModel
|
return BertModel
|
||||||
|
if model_architecture == "NomicBertModel":
|
||||||
|
return NomicBertModel
|
||||||
return Model
|
return Model
|
||||||
|
|
||||||
def _is_model_safetensors(self) -> bool:
|
def _is_model_safetensors(self) -> bool:
|
||||||
@ -268,6 +275,8 @@ class Model:
|
|||||||
return gguf.MODEL_ARCH.MINICPM
|
return gguf.MODEL_ARCH.MINICPM
|
||||||
if arch == "BertModel":
|
if arch == "BertModel":
|
||||||
return gguf.MODEL_ARCH.BERT
|
return gguf.MODEL_ARCH.BERT
|
||||||
|
if arch == "NomicBertModel":
|
||||||
|
return gguf.MODEL_ARCH.NOMIC_BERT
|
||||||
|
|
||||||
raise NotImplementedError(f'Architecture "{arch}" not supported!')
|
raise NotImplementedError(f'Architecture "{arch}" not supported!')
|
||||||
|
|
||||||
@ -1297,21 +1306,21 @@ class GPT2Model(Model):
|
|||||||
|
|
||||||
class Phi2Model(Model):
|
class Phi2Model(Model):
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
block_count = get_key_opts(self.hparams, ["num_hidden_layers", "n_layer"])
|
block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
|
||||||
|
|
||||||
rot_pct = get_key_opts(self.hparams, ["partial_rotary_factor"])
|
rot_pct = self.find_hparam(["partial_rotary_factor"])
|
||||||
n_embd = get_key_opts(self.hparams, ["hidden_size", "n_embd"])
|
n_embd = self.find_hparam(["hidden_size", "n_embd"])
|
||||||
n_head = get_key_opts(self.hparams, ["num_attention_heads", "n_head"])
|
n_head = self.find_hparam(["num_attention_heads", "n_head"])
|
||||||
|
|
||||||
self.gguf_writer.add_name("Phi2")
|
self.gguf_writer.add_name("Phi2")
|
||||||
self.gguf_writer.add_context_length(get_key_opts(self.hparams, ["n_positions", "max_position_embeddings"]))
|
self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
|
||||||
|
|
||||||
self.gguf_writer.add_embedding_length(n_embd)
|
self.gguf_writer.add_embedding_length(n_embd)
|
||||||
self.gguf_writer.add_feed_forward_length(4 * n_embd)
|
self.gguf_writer.add_feed_forward_length(4 * n_embd)
|
||||||
self.gguf_writer.add_block_count(block_count)
|
self.gguf_writer.add_block_count(block_count)
|
||||||
self.gguf_writer.add_head_count(n_head)
|
self.gguf_writer.add_head_count(n_head)
|
||||||
self.gguf_writer.add_head_count_kv(n_head)
|
self.gguf_writer.add_head_count_kv(n_head)
|
||||||
self.gguf_writer.add_layer_norm_eps(get_key_opts(self.hparams, ["layer_norm_epsilon", "layer_norm_eps"]))
|
self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"]))
|
||||||
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
|
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
|
||||||
self.gguf_writer.add_file_type(self.ftype)
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
self.gguf_writer.add_add_bos_token(False)
|
self.gguf_writer.add_add_bos_token(False)
|
||||||
@ -1636,20 +1645,12 @@ in chat mode so that the conversation can end normally.")
|
|||||||
class BertModel(Model):
|
class BertModel(Model):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
self.block_count = self.hparams["num_hidden_layers"]
|
self.vocab_size = None
|
||||||
|
|
||||||
def set_gguf_parameters(self):
|
def set_gguf_parameters(self):
|
||||||
# TODO(cebtenzzre): merge with parent class
|
super().set_gguf_parameters()
|
||||||
self.gguf_writer.add_name(self.dir_model.name)
|
|
||||||
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
|
||||||
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
|
||||||
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
|
||||||
self.gguf_writer.add_block_count(self.block_count)
|
|
||||||
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
|
||||||
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
|
|
||||||
self.gguf_writer.add_causal_attention(False)
|
self.gguf_writer.add_causal_attention(False)
|
||||||
self.gguf_writer.add_pooling_layer(True)
|
self.gguf_writer.add_pooling_layer(True)
|
||||||
self.gguf_writer.add_file_type(self.ftype)
|
|
||||||
|
|
||||||
def set_vocab(self):
|
def set_vocab(self):
|
||||||
path = self.dir_model
|
path = self.dir_model
|
||||||
@ -1659,6 +1660,7 @@ class BertModel(Model):
|
|||||||
vocab = HfVocab(path, added_tokens_path)
|
vocab = HfVocab(path, added_tokens_path)
|
||||||
tokens, scores, toktypes = zip(*vocab.all_tokens())
|
tokens, scores, toktypes = zip(*vocab.all_tokens())
|
||||||
assert len(tokens) == vocab.vocab_size
|
assert len(tokens) == vocab.vocab_size
|
||||||
|
self.vocab_size = vocab.vocab_size
|
||||||
|
|
||||||
# we need this to validate the size of the token_type embeddings
|
# we need this to validate the size of the token_type embeddings
|
||||||
# though currently we are passing all zeros to the token_type embeddings
|
# though currently we are passing all zeros to the token_type embeddings
|
||||||
@ -1672,7 +1674,7 @@ class BertModel(Model):
|
|||||||
if tok.startswith(b"##"):
|
if tok.startswith(b"##"):
|
||||||
return tok[2:]
|
return tok[2:]
|
||||||
return b"\xe2\x96\x81" + tok
|
return b"\xe2\x96\x81" + tok
|
||||||
tokens = [phantom(t, y) for t, y in zip(tokens, toktypes)]
|
tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes))
|
||||||
|
|
||||||
# set up bos and eos tokens (cls and sep)
|
# set up bos and eos tokens (cls and sep)
|
||||||
self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
|
self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
|
||||||
@ -1724,6 +1726,43 @@ class BertModel(Model):
|
|||||||
self.gguf_writer.add_tensor(new_name, data)
|
self.gguf_writer.add_tensor(new_name, data)
|
||||||
|
|
||||||
|
|
||||||
|
class NomicBertModel(BertModel):
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
|
# the HF config claims n_ctx=8192, but it uses RoPE scaling
|
||||||
|
self.hparams["n_ctx"] = 2048
|
||||||
|
|
||||||
|
# SwigLU activation
|
||||||
|
assert self.hparams["activation_function"] == "swiglu"
|
||||||
|
# this doesn't do anything in the HF version
|
||||||
|
assert self.hparams["causal"] is False
|
||||||
|
# no bias tensors
|
||||||
|
assert self.hparams["qkv_proj_bias"] is False
|
||||||
|
assert self.hparams["mlp_fc1_bias"] is False
|
||||||
|
assert self.hparams["mlp_fc2_bias"] is False
|
||||||
|
# norm at end of layer
|
||||||
|
assert self.hparams["prenorm"] is False
|
||||||
|
# standard RoPE
|
||||||
|
assert self.hparams["rotary_emb_fraction"] == 1.0
|
||||||
|
assert self.hparams["rotary_emb_interleaved"] is False
|
||||||
|
assert self.hparams["rotary_emb_scale_base"] is None
|
||||||
|
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
super().set_gguf_parameters()
|
||||||
|
self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
|
||||||
|
|
||||||
|
def get_tensors(self):
|
||||||
|
assert self.vocab_size is not None
|
||||||
|
for name, data in super().get_tensors():
|
||||||
|
# Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
|
||||||
|
if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
|
||||||
|
rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
|
||||||
|
assert data.shape == (rounded_vocab_size, self.hparams["n_embd"])
|
||||||
|
data = data[:self.vocab_size, :]
|
||||||
|
yield name, data
|
||||||
|
|
||||||
|
|
||||||
###### CONVERSION LOGIC ######
|
###### CONVERSION LOGIC ######
|
||||||
|
|
||||||
|
|
||||||
|
@ -87,27 +87,28 @@ class Keys:
|
|||||||
|
|
||||||
|
|
||||||
class MODEL_ARCH(IntEnum):
|
class MODEL_ARCH(IntEnum):
|
||||||
LLAMA = auto()
|
LLAMA = auto()
|
||||||
FALCON = auto()
|
FALCON = auto()
|
||||||
BAICHUAN = auto()
|
BAICHUAN = auto()
|
||||||
GPT2 = auto()
|
GPT2 = auto()
|
||||||
GPTJ = auto()
|
GPTJ = auto()
|
||||||
GPTNEOX = auto()
|
GPTNEOX = auto()
|
||||||
MPT = auto()
|
MPT = auto()
|
||||||
STARCODER = auto()
|
STARCODER = auto()
|
||||||
PERSIMMON = auto()
|
PERSIMMON = auto()
|
||||||
REFACT = auto()
|
REFACT = auto()
|
||||||
BERT = auto()
|
BERT = auto()
|
||||||
BLOOM = auto()
|
NOMIC_BERT = auto()
|
||||||
STABLELM = auto()
|
BLOOM = auto()
|
||||||
QWEN = auto()
|
STABLELM = auto()
|
||||||
QWEN2 = auto()
|
QWEN = auto()
|
||||||
PHI2 = auto()
|
QWEN2 = auto()
|
||||||
PLAMO = auto()
|
PHI2 = auto()
|
||||||
CODESHELL = auto()
|
PLAMO = auto()
|
||||||
ORION = auto()
|
CODESHELL = auto()
|
||||||
|
ORION = auto()
|
||||||
INTERNLM2 = auto()
|
INTERNLM2 = auto()
|
||||||
MINICPM = auto()
|
MINICPM = auto()
|
||||||
|
|
||||||
|
|
||||||
class MODEL_TENSOR(IntEnum):
|
class MODEL_TENSOR(IntEnum):
|
||||||
@ -153,6 +154,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
|||||||
MODEL_ARCH.PERSIMMON: "persimmon",
|
MODEL_ARCH.PERSIMMON: "persimmon",
|
||||||
MODEL_ARCH.REFACT: "refact",
|
MODEL_ARCH.REFACT: "refact",
|
||||||
MODEL_ARCH.BERT: "bert",
|
MODEL_ARCH.BERT: "bert",
|
||||||
|
MODEL_ARCH.NOMIC_BERT: "nomic-bert",
|
||||||
MODEL_ARCH.BLOOM: "bloom",
|
MODEL_ARCH.BLOOM: "bloom",
|
||||||
MODEL_ARCH.STABLELM: "stablelm",
|
MODEL_ARCH.STABLELM: "stablelm",
|
||||||
MODEL_ARCH.QWEN: "qwen",
|
MODEL_ARCH.QWEN: "qwen",
|
||||||
@ -282,6 +284,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||||||
MODEL_TENSOR.FFN_UP,
|
MODEL_TENSOR.FFN_UP,
|
||||||
MODEL_TENSOR.LAYER_OUT_NORM,
|
MODEL_TENSOR.LAYER_OUT_NORM,
|
||||||
],
|
],
|
||||||
|
MODEL_ARCH.NOMIC_BERT: [
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
||||||
|
MODEL_TENSOR.TOKEN_TYPES,
|
||||||
|
MODEL_TENSOR.POS_EMBD,
|
||||||
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_OUT_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_QKV,
|
||||||
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
|
MODEL_TENSOR.FFN_GATE,
|
||||||
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
|
MODEL_TENSOR.FFN_UP,
|
||||||
|
MODEL_TENSOR.LAYER_OUT_NORM,
|
||||||
|
],
|
||||||
MODEL_ARCH.MPT: [
|
MODEL_ARCH.MPT: [
|
||||||
MODEL_TENSOR.TOKEN_EMBD,
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
MODEL_TENSOR.OUTPUT_NORM,
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
@ -15,7 +15,7 @@ class TensorNameMap:
|
|||||||
"word_embeddings", # bloom
|
"word_embeddings", # bloom
|
||||||
"model.embed_tokens", # llama-hf
|
"model.embed_tokens", # llama-hf
|
||||||
"tok_embeddings", # llama-pth
|
"tok_embeddings", # llama-pth
|
||||||
"embeddings.word_embeddings", # bert
|
"embeddings.word_embeddings", # bert nomic-bert
|
||||||
"language_model.embedding.word_embeddings", # persimmon
|
"language_model.embedding.word_embeddings", # persimmon
|
||||||
"wte", # gpt2
|
"wte", # gpt2
|
||||||
"transformer.embd.wte", # phi2
|
"transformer.embd.wte", # phi2
|
||||||
@ -24,13 +24,14 @@ class TensorNameMap:
|
|||||||
|
|
||||||
# Token type embeddings
|
# Token type embeddings
|
||||||
MODEL_TENSOR.TOKEN_TYPES: (
|
MODEL_TENSOR.TOKEN_TYPES: (
|
||||||
"embeddings.token_type_embeddings", # bert
|
"embeddings.token_type_embeddings", # bert nomic-bert
|
||||||
),
|
),
|
||||||
|
|
||||||
# Normalization of token embeddings
|
# Normalization of token embeddings
|
||||||
MODEL_TENSOR.TOKEN_EMBD_NORM: (
|
MODEL_TENSOR.TOKEN_EMBD_NORM: (
|
||||||
"word_embeddings_layernorm", # bloom
|
"word_embeddings_layernorm", # bloom
|
||||||
"embeddings.LayerNorm", # bert
|
"embeddings.LayerNorm", # bert
|
||||||
|
"emb_ln", # nomic-bert
|
||||||
),
|
),
|
||||||
|
|
||||||
# Position embeddings
|
# Position embeddings
|
||||||
@ -103,6 +104,7 @@ class TensorNameMap:
|
|||||||
"model.layers.{bid}.self_attn.query_key_value", # persimmon
|
"model.layers.{bid}.self_attn.query_key_value", # persimmon
|
||||||
"h.{bid}.attn.c_attn", # gpt2
|
"h.{bid}.attn.c_attn", # gpt2
|
||||||
"transformer.h.{bid}.mixer.Wqkv", # phi2
|
"transformer.h.{bid}.mixer.Wqkv", # phi2
|
||||||
|
"encoder.layers.{bid}.attn.Wqkv", # nomic-bert
|
||||||
),
|
),
|
||||||
|
|
||||||
# Attention query
|
# Attention query
|
||||||
@ -152,11 +154,13 @@ class TensorNameMap:
|
|||||||
"transformer.h.{bid}.mixer.out_proj", # phi2
|
"transformer.h.{bid}.mixer.out_proj", # phi2
|
||||||
"model.layers.layers.{bid}.self_attn.o_proj", # plamo
|
"model.layers.layers.{bid}.self_attn.o_proj", # plamo
|
||||||
"model.layers.{bid}.attention.wo", # internlm2
|
"model.layers.{bid}.attention.wo", # internlm2
|
||||||
|
"encoder.layers.{bid}.attn.out_proj", # nomic-bert
|
||||||
),
|
),
|
||||||
|
|
||||||
# Attention output norm
|
# Attention output norm
|
||||||
MODEL_TENSOR.ATTN_OUT_NORM: (
|
MODEL_TENSOR.ATTN_OUT_NORM: (
|
||||||
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
|
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
|
||||||
|
"encoder.layers.{bid}.norm1", # nomic-bert
|
||||||
),
|
),
|
||||||
|
|
||||||
# Rotary embeddings
|
# Rotary embeddings
|
||||||
@ -205,6 +209,7 @@ class TensorNameMap:
|
|||||||
"model.layers.{bid}.mlp.fc1", # phi2
|
"model.layers.{bid}.mlp.fc1", # phi2
|
||||||
"model.layers.layers.{bid}.mlp.up_proj", # plamo
|
"model.layers.layers.{bid}.mlp.up_proj", # plamo
|
||||||
"model.layers.{bid}.feed_forward.w3", # internlm2
|
"model.layers.{bid}.feed_forward.w3", # internlm2
|
||||||
|
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_UP_EXP: (
|
MODEL_TENSOR.FFN_UP_EXP: (
|
||||||
@ -224,6 +229,7 @@ class TensorNameMap:
|
|||||||
"transformer.h.{bid}.mlp.w2", # qwen
|
"transformer.h.{bid}.mlp.w2", # qwen
|
||||||
"model.layers.layers.{bid}.mlp.gate_proj", # plamo
|
"model.layers.layers.{bid}.mlp.gate_proj", # plamo
|
||||||
"model.layers.{bid}.feed_forward.w1", # internlm2
|
"model.layers.{bid}.feed_forward.w1", # internlm2
|
||||||
|
"encoder.layers.{bid}.mlp.fc12", # nomic-bert
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_GATE_EXP: (
|
MODEL_TENSOR.FFN_GATE_EXP: (
|
||||||
@ -249,6 +255,7 @@ class TensorNameMap:
|
|||||||
"model.layers.{bid}.mlp.fc2", # phi2
|
"model.layers.{bid}.mlp.fc2", # phi2
|
||||||
"model.layers.layers.{bid}.mlp.down_proj", # plamo
|
"model.layers.layers.{bid}.mlp.down_proj", # plamo
|
||||||
"model.layers.{bid}.feed_forward.w2", # internlm2
|
"model.layers.{bid}.feed_forward.w2", # internlm2
|
||||||
|
"encoder.layers.{bid}.mlp.fc2", # nomic-bert
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_DOWN_EXP: (
|
MODEL_TENSOR.FFN_DOWN_EXP: (
|
||||||
@ -272,6 +279,7 @@ class TensorNameMap:
|
|||||||
|
|
||||||
MODEL_TENSOR.LAYER_OUT_NORM: (
|
MODEL_TENSOR.LAYER_OUT_NORM: (
|
||||||
"encoder.layer.{bid}.output.LayerNorm", # bert
|
"encoder.layer.{bid}.output.LayerNorm", # bert
|
||||||
|
"encoder.layers.{bid}.norm2", # nomic-bert
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
201
llama.cpp
201
llama.cpp
@ -197,6 +197,7 @@ enum llm_arch {
|
|||||||
LLM_ARCH_PERSIMMON,
|
LLM_ARCH_PERSIMMON,
|
||||||
LLM_ARCH_REFACT,
|
LLM_ARCH_REFACT,
|
||||||
LLM_ARCH_BERT,
|
LLM_ARCH_BERT,
|
||||||
|
LLM_ARCH_NOMIC_BERT,
|
||||||
LLM_ARCH_BLOOM,
|
LLM_ARCH_BLOOM,
|
||||||
LLM_ARCH_STABLELM,
|
LLM_ARCH_STABLELM,
|
||||||
LLM_ARCH_QWEN,
|
LLM_ARCH_QWEN,
|
||||||
@ -211,27 +212,28 @@ enum llm_arch {
|
|||||||
};
|
};
|
||||||
|
|
||||||
static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||||
{ LLM_ARCH_LLAMA, "llama" },
|
{ LLM_ARCH_LLAMA, "llama" },
|
||||||
{ LLM_ARCH_FALCON, "falcon" },
|
{ LLM_ARCH_FALCON, "falcon" },
|
||||||
{ LLM_ARCH_GPT2, "gpt2" },
|
{ LLM_ARCH_GPT2, "gpt2" },
|
||||||
{ LLM_ARCH_GPTJ, "gptj" },
|
{ LLM_ARCH_GPTJ, "gptj" },
|
||||||
{ LLM_ARCH_GPTNEOX, "gptneox" },
|
{ LLM_ARCH_GPTNEOX, "gptneox" },
|
||||||
{ LLM_ARCH_MPT, "mpt" },
|
{ LLM_ARCH_MPT, "mpt" },
|
||||||
{ LLM_ARCH_BAICHUAN, "baichuan" },
|
{ LLM_ARCH_BAICHUAN, "baichuan" },
|
||||||
{ LLM_ARCH_STARCODER, "starcoder" },
|
{ LLM_ARCH_STARCODER, "starcoder" },
|
||||||
{ LLM_ARCH_PERSIMMON, "persimmon" },
|
{ LLM_ARCH_PERSIMMON, "persimmon" },
|
||||||
{ LLM_ARCH_REFACT, "refact" },
|
{ LLM_ARCH_REFACT, "refact" },
|
||||||
{ LLM_ARCH_BERT, "bert" },
|
{ LLM_ARCH_BERT, "bert" },
|
||||||
{ LLM_ARCH_BLOOM, "bloom" },
|
{ LLM_ARCH_NOMIC_BERT, "nomic-bert" },
|
||||||
{ LLM_ARCH_STABLELM, "stablelm" },
|
{ LLM_ARCH_BLOOM, "bloom" },
|
||||||
{ LLM_ARCH_QWEN, "qwen" },
|
{ LLM_ARCH_STABLELM, "stablelm" },
|
||||||
{ LLM_ARCH_QWEN2, "qwen2" },
|
{ LLM_ARCH_QWEN, "qwen" },
|
||||||
{ LLM_ARCH_PHI2, "phi2" },
|
{ LLM_ARCH_QWEN2, "qwen2" },
|
||||||
{ LLM_ARCH_PLAMO, "plamo" },
|
{ LLM_ARCH_PHI2, "phi2" },
|
||||||
{ LLM_ARCH_CODESHELL, "codeshell" },
|
{ LLM_ARCH_PLAMO, "plamo" },
|
||||||
{ LLM_ARCH_ORION, "orion" },
|
{ LLM_ARCH_CODESHELL, "codeshell" },
|
||||||
{ LLM_ARCH_INTERNLM2, "internlm2" },
|
{ LLM_ARCH_ORION, "orion" },
|
||||||
{ LLM_ARCH_MINICPM, "minicpm" },
|
{ LLM_ARCH_INTERNLM2, "internlm2" },
|
||||||
|
{ LLM_ARCH_MINICPM, "minicpm" },
|
||||||
};
|
};
|
||||||
|
|
||||||
enum llm_kv {
|
enum llm_kv {
|
||||||
@ -375,6 +377,7 @@ enum llm_tensor {
|
|||||||
LLM_TENSOR_ATTN_OUT,
|
LLM_TENSOR_ATTN_OUT,
|
||||||
LLM_TENSOR_ATTN_NORM,
|
LLM_TENSOR_ATTN_NORM,
|
||||||
LLM_TENSOR_ATTN_NORM_2,
|
LLM_TENSOR_ATTN_NORM_2,
|
||||||
|
LLM_TENSOR_ATTN_OUT_NORM,
|
||||||
LLM_TENSOR_ATTN_ROT_EMBD,
|
LLM_TENSOR_ATTN_ROT_EMBD,
|
||||||
LLM_TENSOR_FFN_GATE_INP,
|
LLM_TENSOR_FFN_GATE_INP,
|
||||||
LLM_TENSOR_FFN_NORM,
|
LLM_TENSOR_FFN_NORM,
|
||||||
@ -387,6 +390,7 @@ enum llm_tensor {
|
|||||||
LLM_TENSOR_FFN_UP_EXP,
|
LLM_TENSOR_FFN_UP_EXP,
|
||||||
LLM_TENSOR_ATTN_Q_NORM,
|
LLM_TENSOR_ATTN_Q_NORM,
|
||||||
LLM_TENSOR_ATTN_K_NORM,
|
LLM_TENSOR_ATTN_K_NORM,
|
||||||
|
LLM_TENSOR_LAYER_OUT_NORM,
|
||||||
};
|
};
|
||||||
|
|
||||||
static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
|
static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
|
||||||
@ -552,12 +556,27 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
|||||||
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
||||||
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
|
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
|
||||||
{ LLM_TENSOR_POS_EMBD, "position_embd" },
|
{ LLM_TENSOR_POS_EMBD, "position_embd" },
|
||||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_output_norm" },
|
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
|
||||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.layer_output_norm" },
|
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
|
||||||
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||||
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
LLM_ARCH_NOMIC_BERT,
|
||||||
|
{
|
||||||
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||||
|
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
||||||
|
{ LLM_TENSOR_TOKEN_TYPES, "token_types" },
|
||||||
|
{ LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
|
||||||
|
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||||
|
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||||
|
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
},
|
},
|
||||||
@ -1485,6 +1504,7 @@ enum e_model {
|
|||||||
MODEL_22M,
|
MODEL_22M,
|
||||||
MODEL_33M,
|
MODEL_33M,
|
||||||
MODEL_109M,
|
MODEL_109M,
|
||||||
|
MODEL_137M,
|
||||||
MODEL_335M,
|
MODEL_335M,
|
||||||
MODEL_0_5B,
|
MODEL_0_5B,
|
||||||
MODEL_1B,
|
MODEL_1B,
|
||||||
@ -1620,6 +1640,8 @@ struct llama_layer {
|
|||||||
struct ggml_tensor * attn_q_norm_b;
|
struct ggml_tensor * attn_q_norm_b;
|
||||||
struct ggml_tensor * attn_k_norm;
|
struct ggml_tensor * attn_k_norm;
|
||||||
struct ggml_tensor * attn_k_norm_b;
|
struct ggml_tensor * attn_k_norm_b;
|
||||||
|
struct ggml_tensor * attn_out_norm;
|
||||||
|
struct ggml_tensor * attn_out_norm_b;
|
||||||
|
|
||||||
// attention
|
// attention
|
||||||
struct ggml_tensor * wq;
|
struct ggml_tensor * wq;
|
||||||
@ -1638,6 +1660,8 @@ struct llama_layer {
|
|||||||
// normalization
|
// normalization
|
||||||
struct ggml_tensor * ffn_norm;
|
struct ggml_tensor * ffn_norm;
|
||||||
struct ggml_tensor * ffn_norm_b;
|
struct ggml_tensor * ffn_norm_b;
|
||||||
|
struct ggml_tensor * layer_out_norm;
|
||||||
|
struct ggml_tensor * layer_out_norm_b;
|
||||||
|
|
||||||
// ff
|
// ff
|
||||||
struct ggml_tensor * ffn_gate; // w1
|
struct ggml_tensor * ffn_gate; // w1
|
||||||
@ -2855,6 +2879,11 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|||||||
|
|
||||||
static const char * llama_model_type_name(e_model type) {
|
static const char * llama_model_type_name(e_model type) {
|
||||||
switch (type) {
|
switch (type) {
|
||||||
|
case MODEL_22M: return "22M";
|
||||||
|
case MODEL_33M: return "33M";
|
||||||
|
case MODEL_109M: return "109M";
|
||||||
|
case MODEL_137M: return "137M";
|
||||||
|
case MODEL_0_5B: return "0.5B";
|
||||||
case MODEL_1B: return "1B";
|
case MODEL_1B: return "1B";
|
||||||
case MODEL_2B: return "2B";
|
case MODEL_2B: return "2B";
|
||||||
case MODEL_3B: return "3B";
|
case MODEL_3B: return "3B";
|
||||||
@ -3073,6 +3102,17 @@ static void llm_load_hparams(
|
|||||||
model.type = e_model::MODEL_335M; break; // bge-large
|
model.type = e_model::MODEL_335M; break; // bge-large
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_NOMIC_BERT:
|
||||||
|
{
|
||||||
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||||
|
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||||
|
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
||||||
|
ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer);
|
||||||
|
|
||||||
|
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
|
||||||
|
model.type = e_model::MODEL_137M;
|
||||||
|
}
|
||||||
|
} break;
|
||||||
case LLM_ARCH_BLOOM:
|
case LLM_ARCH_BLOOM:
|
||||||
{
|
{
|
||||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||||
@ -3875,10 +3915,14 @@ static bool llm_load_tensors(
|
|||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case LLM_ARCH_BERT:
|
case LLM_ARCH_BERT:
|
||||||
|
case LLM_ARCH_NOMIC_BERT:
|
||||||
{
|
{
|
||||||
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||||
model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
|
model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
|
||||||
model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
|
if (model.arch == LLM_ARCH_BERT) {
|
||||||
|
model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
|
||||||
|
}
|
||||||
|
|
||||||
model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
|
model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
|
||||||
model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
|
model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
|
||||||
|
|
||||||
@ -3888,29 +3932,38 @@ static bool llm_load_tensors(
|
|||||||
|
|
||||||
auto & layer = model.layers[i];
|
auto & layer = model.layers[i];
|
||||||
|
|
||||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
if (model.arch == LLM_ARCH_BERT) {
|
||||||
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
||||||
|
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
|
||||||
|
|
||||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
||||||
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
|
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
||||||
|
|
||||||
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
||||||
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
|
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
|
||||||
|
} else {
|
||||||
|
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
||||||
|
}
|
||||||
|
|
||||||
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||||
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
|
||||||
|
|
||||||
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
|
||||||
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
|
layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
|
||||||
|
|
||||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||||
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
||||||
|
|
||||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
if (model.arch == LLM_ARCH_BERT) {
|
||||||
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
|
||||||
|
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
||||||
|
|
||||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
||||||
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
} else {
|
||||||
|
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||||
|
}
|
||||||
|
|
||||||
|
layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
|
||||||
|
layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case LLM_ARCH_BLOOM:
|
case LLM_ARCH_BLOOM:
|
||||||
@ -5773,6 +5826,7 @@ struct llm_build_context {
|
|||||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||||
|
|
||||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||||
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||||
|
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
@ -5789,7 +5843,9 @@ struct llm_build_context {
|
|||||||
// token types are hardcoded to zero ("Sentence A")
|
// token types are hardcoded to zero ("Sentence A")
|
||||||
struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
|
struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
|
||||||
inpL = ggml_add(ctx0, inpL, type_row0);
|
inpL = ggml_add(ctx0, inpL, type_row0);
|
||||||
inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
|
if (model.arch == LLM_ARCH_BERT) {
|
||||||
|
inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
|
||||||
|
}
|
||||||
cb(inpL, "inp_embd", -1);
|
cb(inpL, "inp_embd", -1);
|
||||||
|
|
||||||
// embed layer norm
|
// embed layer norm
|
||||||
@ -5805,7 +5861,7 @@ struct llm_build_context {
|
|||||||
struct ggml_tensor * cur = inpL;
|
struct ggml_tensor * cur = inpL;
|
||||||
|
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
if (model.arch == LLM_ARCH_BERT) {
|
||||||
struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
|
struct ggml_tensor * Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
@ -5818,6 +5874,37 @@ struct llm_build_context {
|
|||||||
// seems like we just need to do this for Q?
|
// seems like we just need to do this for Q?
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||||
|
|
||||||
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||||
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
|
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||||
|
cb(cur, "kqv_out", il);
|
||||||
|
} else {
|
||||||
|
// compute Q and K and RoPE them
|
||||||
|
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
||||||
|
cb(cur, "wqkv", il);
|
||||||
|
|
||||||
|
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
||||||
|
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
||||||
|
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
||||||
|
|
||||||
|
cb(Qcur, "Qcur", il);
|
||||||
|
cb(Kcur, "Kcur", il);
|
||||||
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
|
Qcur = ggml_rope_custom(
|
||||||
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||||
|
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
|
);
|
||||||
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
|
Kcur = ggml_rope_custom(
|
||||||
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||||
|
hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
||||||
|
ext_factor, attn_factor, beta_fast, beta_slow
|
||||||
|
);
|
||||||
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
||||||
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||||
model.layers[il].wo, model.layers[il].bo,
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||||
@ -5828,25 +5915,34 @@ struct llm_build_context {
|
|||||||
cur = ggml_add(ctx0, cur, inpL);
|
cur = ggml_add(ctx0, cur, inpL);
|
||||||
|
|
||||||
// attention layer norm
|
// attention layer norm
|
||||||
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm, model.layers[il].attn_norm_b, LLM_NORM, cb, il);
|
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
|
||||||
|
|
||||||
struct ggml_tensor * ffn_inp = cur;
|
struct ggml_tensor * ffn_inp = cur;
|
||||||
cb(ffn_inp, "ffn_inp", il);
|
cb(ffn_inp, "ffn_inp", il);
|
||||||
|
|
||||||
// feed-forward network
|
// feed-forward network
|
||||||
cur = llm_build_ffn(ctx0, cur,
|
if (model.arch == LLM_ARCH_BERT) {
|
||||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
cur = llm_build_ffn(ctx0, cur,
|
||||||
NULL, NULL,
|
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
||||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
NULL, NULL,
|
||||||
NULL,
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
||||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
NULL,
|
||||||
|
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||||
|
} else {
|
||||||
|
cur = llm_build_ffn(ctx0, cur,
|
||||||
|
model.layers[il].ffn_up, NULL,
|
||||||
|
model.layers[il].ffn_gate, NULL,
|
||||||
|
model.layers[il].ffn_down, NULL,
|
||||||
|
NULL,
|
||||||
|
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||||
|
}
|
||||||
cb(cur, "ffn_out", il);
|
cb(cur, "ffn_out", il);
|
||||||
|
|
||||||
// attentions bypass the intermediate layer
|
// attentions bypass the intermediate layer
|
||||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||||
|
|
||||||
// output layer norm
|
// output layer norm
|
||||||
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, cb, il);
|
cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il);
|
||||||
|
|
||||||
// input for next layer
|
// input for next layer
|
||||||
inpL = cur;
|
inpL = cur;
|
||||||
@ -7289,6 +7385,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|||||||
result = llm.build_refact();
|
result = llm.build_refact();
|
||||||
} break;
|
} break;
|
||||||
case LLM_ARCH_BERT:
|
case LLM_ARCH_BERT:
|
||||||
|
case LLM_ARCH_NOMIC_BERT:
|
||||||
{
|
{
|
||||||
result = llm.build_bert();
|
result = llm.build_bert();
|
||||||
} break;
|
} break;
|
||||||
|
Loading…
Reference in New Issue
Block a user