mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-11 19:21:46 +00:00
examples : fix gpt-neox (#2943)
Co-authored-by: mmnga <mmnga1mmnga@gmail.com>
This commit is contained in:
parent
2753415afd
commit
c42f0ec6b3
@ -660,9 +660,10 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
|
|||||||
ggml_tensor * gpt_neox_ff(
|
ggml_tensor * gpt_neox_ff(
|
||||||
const gpt_neox_block &block,
|
const gpt_neox_block &block,
|
||||||
ggml_context * ctx0,
|
ggml_context * ctx0,
|
||||||
ggml_tensor * inp) {
|
ggml_tensor * inp,
|
||||||
|
const gpt_neox_hparams &hparams) {
|
||||||
|
|
||||||
ggml_tensor * cur = ggml_norm(ctx0, inp);
|
ggml_tensor * cur = ggml_norm(ctx0, inp, hparams.norm_eps);
|
||||||
|
|
||||||
cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, block.ln_2_g, cur), cur), ggml_repeat(ctx0, block.ln_2_b, cur));
|
cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, block.ln_2_g, cur), cur), ggml_repeat(ctx0, block.ln_2_b, cur));
|
||||||
cur = ggml_mul_mat(ctx0, block.c_mlp_fc_w, cur);
|
cur = ggml_mul_mat(ctx0, block.c_mlp_fc_w, cur);
|
||||||
@ -753,7 +754,7 @@ bool gpt_neox_eval(
|
|||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
{
|
{
|
||||||
cur = ggml_norm(ctx0, inpL);
|
cur = ggml_norm(ctx0, inpL, hparams.norm_eps);
|
||||||
|
|
||||||
cur = ggml_add(ctx0,
|
cur = ggml_add(ctx0,
|
||||||
ggml_mul(ctx0, ggml_repeat(ctx0, model.blocks[il].ln_1_g, cur), cur),
|
ggml_mul(ctx0, ggml_repeat(ctx0, model.blocks[il].ln_1_g, cur), cur),
|
||||||
@ -844,7 +845,7 @@ bool gpt_neox_eval(
|
|||||||
if (hparams.par_res == 0) {
|
if (hparams.par_res == 0) {
|
||||||
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
|
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
|
||||||
|
|
||||||
cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF);
|
cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF, hparams);
|
||||||
|
|
||||||
// input for next layer
|
// input for next layer
|
||||||
inpL = ggml_add(ctx0, cur, inpFF);
|
inpL = ggml_add(ctx0, cur, inpFF);
|
||||||
@ -853,7 +854,7 @@ bool gpt_neox_eval(
|
|||||||
|
|
||||||
// this is independent of the self-attention result, so it could be done in parallel to the self-attention
|
// this is independent of the self-attention result, so it could be done in parallel to the self-attention
|
||||||
// note here we pass inpL instead of cur
|
// note here we pass inpL instead of cur
|
||||||
cur = gpt_neox_ff(model.blocks[il], ctx0, inpL);
|
cur = gpt_neox_ff(model.blocks[il], ctx0, inpL, hparams);
|
||||||
|
|
||||||
// layer input + FF
|
// layer input + FF
|
||||||
cur = ggml_add(ctx0, cur, inpFF);
|
cur = ggml_add(ctx0, cur, inpFF);
|
||||||
@ -867,7 +868,7 @@ bool gpt_neox_eval(
|
|||||||
|
|
||||||
// norm
|
// norm
|
||||||
{
|
{
|
||||||
inpL = ggml_norm(ctx0, inpL);
|
inpL = ggml_norm(ctx0, inpL, hparams.norm_eps);
|
||||||
|
|
||||||
// inpL = ln_f_g*inpL + ln_f_b
|
// inpL = ln_f_g*inpL + ln_f_b
|
||||||
inpL = ggml_add(ctx0,
|
inpL = ggml_add(ctx0,
|
||||||
|
42
llama.cpp
42
llama.cpp
@ -325,6 +325,44 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
|||||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
LLM_ARCH_GPT2,
|
||||||
|
{
|
||||||
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
LLM_ARCH_GPTJ,
|
||||||
|
{
|
||||||
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
LLM_ARCH_GPTNEOX,
|
||||||
|
{
|
||||||
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||||
|
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||||
|
{ LLM_TENSOR_OUTPUT, "output" },
|
||||||
|
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||||
|
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||||
|
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||||
|
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||||
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||||
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
LLM_ARCH_MPT,
|
||||||
|
{
|
||||||
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
LLM_ARCH_UNKNOWN,
|
||||||
|
{
|
||||||
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||||
|
},
|
||||||
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
static llm_arch llm_arch_from_string(const std::string & name) {
|
static llm_arch llm_arch_from_string(const std::string & name) {
|
||||||
@ -1605,10 +1643,14 @@ static void llm_load_hparams(
|
|||||||
|
|
||||||
GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
|
GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
|
||||||
|
|
||||||
|
if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
|
||||||
if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
|
if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
|
||||||
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
|
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// gpt-neox n_rot = rotary_pct * (n_embd / n_head)
|
||||||
|
// gpt-j n_rot = rotary_dim
|
||||||
|
}
|
||||||
|
|
||||||
// arch-specific KVs
|
// arch-specific KVs
|
||||||
switch (model.arch) {
|
switch (model.arch) {
|
||||||
|
Loading…
Reference in New Issue
Block a user