llama : add jina v2 base code (#7596)
* feat: add changes to handle jina v2 base code
* fix: do not complicate things
* fix: fix the usage of the code model
* fix: fix comments
* fix: fix linting issues
* fix: remove ollama patches
* style : minor

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

parent: 2d08b7fbb4
commit: f5d7b268ec
@@ -83,6 +83,7 @@ models = [
     {"name": "jina-v2-es",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
     {"name": "jina-v2-de",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
     {"name": "smaug-bpe",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
+    {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
 ]


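For context: the update script walks this models[] list, downloads each tokenizer, and hashes its output on a fixed probe text; the digests become the chkhsh branches shown in the next hunk. Below is a minimal sketch of that fingerprinting, assuming transformers is installed; chktxt here is a stand-in for the script's real probe text, which is much longer.

# Sketch: turn a models[] entry into a chkhsh fingerprint.
from hashlib import sha256
from transformers import AutoTokenizer

chktxt = "placeholder probe text with digits 123 and mixed scripts"  # stand-in, not the real probe

def fingerprint(repo_id: str) -> str:
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    # hash the token ids, not the raw text, so any pre-tokenizer or BPE
    # difference between models changes the fingerprint
    return sha256(str(tokenizer.encode(chktxt)).encode()).hexdigest()

print(fingerprint("jinaai/jina-embeddings-v2-base-code"))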
@@ -475,6 +475,9 @@ class Model:
         if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
             # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
             res = "smaug-bpe"
+        if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
+            res = "jina-v2-code"

         if res is None:
             logger.warning("\n")
@@ -2452,11 +2455,13 @@ class JinaBertV2Model(BertModel):

     def get_tensors(self):
         for name, data in super().get_tensors():
-            if 'gated_layers' in name:
+            if 'gated_layer' in name:
                 d1 = data[:self.intermediate_size, :]
                 name1 = name.replace('gated_layers', 'gated_layers_w')
+                name1 = name1.replace('up_gated_layer', 'gated_layers_v')
                 d2 = data[self.intermediate_size:, :]
                 name2 = name.replace('gated_layers', 'gated_layers_v')
+                name2 = name2.replace('up_gated_layer', 'gated_layers_w')
                 yield name1, d1
                 yield name2, d2
                 continue
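These replace() calls split a fused GEGLU projection, stored as one (2*intermediate_size, hidden) tensor, into two separate GGUF tensors. jina-v2-code checkpoints name the fused weight up_gated_layer and stack the two halves in the opposite order to the earlier gated_layers checkpoints, which is why the _w/_v targets are swapped. A self-contained sketch of the same slicing, with made-up sizes:

# Sketch: splitting a fused gate/up projection into two tensors.
import numpy as np

intermediate_size, hidden = 4, 3
fused = np.arange(2 * intermediate_size * hidden, dtype=np.float32).reshape(2 * intermediate_size, hidden)

def split_gated(name: str, data: np.ndarray):
    d1 = data[:intermediate_size, :]   # first stacked half
    d2 = data[intermediate_size:, :]   # second stacked half
    if 'up_gated_layer' in name:       # jina-v2-code: halves in reverse order
        return [(name.replace('up_gated_layer', 'gated_layers_v'), d1),
                (name.replace('up_gated_layer', 'gated_layers_w'), d2)]
    # earlier jina-v2 checkpoints use 'gated_layers' with the original order
    return [(name.replace('gated_layers', 'gated_layers_w'), d1),
            (name.replace('gated_layers', 'gated_layers_v'), d2)]

for new_name, part in split_gated('encoder.layer.0.mlp.up_gated_layer.weight', fused):
    print(new_name, part.shape)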
@@ -415,6 +415,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.TOKEN_EMBD_NORM,
         MODEL_TENSOR.TOKEN_TYPES,
+        MODEL_TENSOR.ATTN_NORM_2,
         MODEL_TENSOR.ATTN_OUT_NORM,
         MODEL_TENSOR.ATTN_Q,
         MODEL_TENSOR.ATTN_Q_NORM,
@@ -102,6 +102,7 @@ class TensorNameMap:
         # Attention norm 2
         MODEL_TENSOR.ATTN_NORM_2: (
             "transformer.h.{bid}.ln_attn",       # falcon40b
+            "encoder.layer.{bid}.layer_norm_1",  # jina-v2-code
         ),

         # Attention query-key-value
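TensorNameMap resolves a checkpoint tensor name by substituting the block index for {bid} in each template, so encoder.layer.3.layer_norm_1 maps to the same GGUF tensor id as falcon40b's transformer.h.3.ln_attn. A toy sketch of that lookup (simplified: the real class prebuilds a reverse dictionary and also handles .weight/.bias suffixes):

# Sketch: resolving HF tensor names through {bid} templates.
templates = {
    "ATTN_NORM_2": (
        "transformer.h.{bid}.ln_attn",       # falcon40b
        "encoder.layer.{bid}.layer_norm_1",  # jina-v2-code
    ),
}

def resolve(hf_name: str, n_blocks: int = 32):
    for tensor_id, names in templates.items():
        for template in names:
            for bid in range(n_blocks):
                if template.format(bid=bid) == hf_name:
                    return tensor_id, bid
    return None

print(resolve("encoder.layer.3.layer_norm_1"))  # ('ATTN_NORM_2', 3)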
@@ -311,6 +312,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.c_proj",       # starcoder2
             "encoder.layer.{bid}.mlp.wo",          # jina-bert-v2
             "model.layers.{bid}.residual_mlp.w2",  # arctic
+            "encoder.layer.{bid}.mlp.down_layer",  # jina-bert-v2
         ),

         MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -350,6 +352,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.norm2",                 # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_3", # Grok
             "encoder.layer.{bid}.mlp.layernorm",          # jina-bert-v2
+            "encoder.layer.{bid}.layer_norm_2"            # jina-v2-code
         ),

         MODEL_TENSOR.SSM_IN: (
llama.cpp
@@ -704,6 +704,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES
             { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
             { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
             { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
+            { LLM_TENSOR_ATTN_NORM_2,     "blk.%d.attn_norm_2" },
             { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
             { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
             { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
@@ -4653,8 +4654,7 @@ static void llm_load_vocab(
             LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
             LLAMA_LOG_WARN("%s: \n", __func__);
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-        } else if (
-                tokenizer_pre == "default") {
+        } else if (tokenizer_pre == "default") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         } else if (
                 tokenizer_pre == "llama3" ||
@@ -4681,7 +4681,8 @@ static void llm_load_vocab(
                 tokenizer_pre == "jina-es" ||
                 tokenizer_pre == "jina-de" ||
                 tokenizer_pre == "jina-v2-es" ||
-                tokenizer_pre == "jina-v2-de") {
+                tokenizer_pre == "jina-v2-de" ||
+                tokenizer_pre == "jina-v2-code") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
         } else if (
                 tokenizer_pre == "refact") {
@@ -5515,7 +5516,7 @@ static bool llm_load_tensors(

                         layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
                     } else {
-                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                     }

                     layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
@@ -5556,6 +5557,9 @@ static bool llm_load_tensors(
                     layer.attn_out_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
                     layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});

+                    layer.attn_norm_2   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
                     layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                     layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});

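llama_model_loader::TENSOR_NOT_REQUIRED marks the two new norms as optional, so jina-v2 checkpoints without layer_norm_1 (the es/de variants) still load, leaving attn_norm_2 as nullptr for the graph builder to check. A hedged Python sketch of that pattern, with a made-up tensor table:

# Sketch: optional-tensor loading; missing optional tensors become None.
weights = {"blk.0.attn_norm.weight": [1.0]}   # hypothetical file contents

def create_tensor(name: str, required: bool = True):
    if name in weights:
        return weights[name]
    if required:
        raise KeyError(f"missing required tensor: {name}")
    return None   # optional: caller checks for None (nullptr in C++)

attn_norm_2 = create_tensor("blk.0.attn_norm_2.weight", required=False)
print(attn_norm_2)   # None -> the build graph skips the extra norm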
@@ -8519,6 +8523,11 @@ struct llm_build_context {
                 // attention layer norm
                 cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);

+                if (model.layers[il].attn_norm_2 != nullptr) {
+                    cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
+                    cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il);
+                }
+
                 struct ggml_tensor * ffn_inp = cur;
                 cb(ffn_inp, "ffn_inp", il);

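Net effect: when attn_norm_2 is present, the layer input is added back in after the attention output norm and normalized a second time before feeding the FFN. A numpy sketch of that data flow, with illustrative shapes and a textbook layer norm (not llama.cpp's API):

# Sketch: the extra pre-FFN norm for jina-v2-code, in numpy.
import numpy as np

def layer_norm(x, w, b, eps=1e-5):
    mu  = x.mean(-1, keepdims=True)
    var = x.var(-1, keepdims=True)
    return (x - mu) / np.sqrt(var + eps) * w + b

n_tokens, n_embd = 4, 8
inpL     = np.random.randn(n_tokens, n_embd)   # layer input
attn_out = np.random.randn(n_tokens, n_embd)   # attention block output
w1 = w2 = np.ones(n_embd)                      # norm weights (illustrative)
b1 = b2 = np.zeros(n_embd)                     # norm biases (illustrative)
has_attn_norm_2 = True                         # False for jina-v2-es/de

cur = layer_norm(attn_out, w1, b1)             # attn_out_norm
if has_attn_norm_2:
    cur = layer_norm(cur + inpL, w2, b2)       # re-add the layer input, attn_norm_2
ffn_inp = cur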
|
Loading…
Reference in New Issue
Block a user