gptneox-main.cpp : n_layer --> n_block

klosax 2023-08-12 23:50:58 +02:00 committed by GitHub
parent e606ffeaee
commit 5e58ffa1ed


@@ -24,13 +24,13 @@ struct gpt_neox_hparams {
uint32_t n_ctx = 0;
uint32_t n_embd = 0;
uint32_t n_head = 0;
-uint32_t n_layer = 0;
+uint32_t n_block = 0;
uint32_t n_rot = 0; // rotary_pct * (n_embd / n_head)
bool par_res = true;
float norm_eps = 1e-5;
};
-struct gpt_neox_layer {
+struct gpt_neox_block {
// pre normalization
struct ggml_tensor * ln_1_g;
struct ggml_tensor * ln_1_b;
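
Taken together, these renames move the per-layer weights into gpt_neox_block, counted by hparams.n_block. A minimal sketch of how the renamed pieces interact, mirroring the loader code further down (the count 24 is a hypothetical value that would normally come from the GGUF metadata):

    gpt_neox_model model;
    model.hparams.n_block = 24;                   // hypothetical block count, normally read from GGUF
    model.blocks.resize(model.hparams.n_block);   // one gpt_neox_block per transformer block
    for (uint32_t i = 0; i < model.hparams.n_block; ++i) {
        gpt_neox_block & block = model.blocks[i]; // holds the attention + FFN weights of block i
    }
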
@@ -65,7 +65,7 @@ struct gpt_neox_model {
struct ggml_tensor * lmh_g; // language model head
-std::vector<gpt_neox_layer> layers;
+std::vector<gpt_neox_block> blocks;
// key + value memory
struct ggml_tensor * memory_k;
@@ -415,7 +415,7 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
if (keyidx != -1) { hparams.n_head = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } }
if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.layer_count");
-if (keyidx != -1) { hparams.n_layer = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } }
+if (keyidx != -1) { hparams.n_block = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } }
if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.rope.dimension_count");
if (keyidx != -1) { hparams.n_rot = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; } }
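
Each hyperparameter lookup in this hunk is compressed onto one line; expanded, a single lookup reads as below, using the same GGUF calls shown in the diff (gguf_find_key returns -1 when the key is absent):

    // expanded form of the renamed lookup above
    int keyidx = gguf_find_key(ggufctx, "gptneox.layer_count");
    if (keyidx != -1) {
        hparams.n_block = gguf_get_val_u32(ggufctx, keyidx); // store into the renamed field
    } else {
        ok = false; // required key missing; the load is reported as failed
    }
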
@@ -434,7 +434,7 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: n_block = %d\n", __func__, hparams.n_block);
printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
printf("%s: par_res = %d\n", __func__, hparams.par_res);
printf("%s: norm_eps = %g\n", __func__, hparams.norm_eps);
@@ -545,9 +545,9 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
// prepare memory for the weights
{
-const int n_layer = model.hparams.n_layer;
+const int n_block = model.hparams.n_block;
-model.layers.resize(n_layer);
+model.blocks.resize(n_block);
model.wte = ggml_get_tensor(ctx, "transformer.token_embd.weight");
model.ln_f_g = ggml_get_tensor(ctx, "transformer.output_norm.weight");
@@ -560,47 +560,47 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
model.tensors["transformer.output_norm.bias"] = model.ln_f_b;
model.tensors["transformer.output.weight"] = model.lmh_g;
-for (int i = 0; i < n_layer; ++i) {
-auto & layer = model.layers[i];
+for (int i = 0; i < n_block; ++i) {
+auto & block = model.blocks[i];
std::string blocknamestart = "transformer.blocks." + std::to_string(i) + ".";
-layer.ln_1_g = get_tensor_ex(ctx, blocknamestart + "attn_norm.weight" );
-layer.ln_1_b = get_tensor_ex(ctx, blocknamestart + "attn_norm.bias" );
+block.ln_1_g = get_tensor_ex(ctx, blocknamestart + "attn_norm.weight" );
+block.ln_1_b = get_tensor_ex(ctx, blocknamestart + "attn_norm.bias" );
-layer.c_attn_attn_w = get_tensor_ex(ctx, blocknamestart + "attn_qkv.weight" );
-layer.c_attn_attn_b = get_tensor_ex(ctx ,blocknamestart + "attn_qkv.bias" );
+block.c_attn_attn_w = get_tensor_ex(ctx, blocknamestart + "attn_qkv.weight" );
+block.c_attn_attn_b = get_tensor_ex(ctx ,blocknamestart + "attn_qkv.bias" );
-layer.c_attn_proj_w = get_tensor_ex(ctx, blocknamestart + "attn_output.weight" );
-layer.c_attn_proj_b = get_tensor_ex(ctx, blocknamestart + "attn_output.bias" );
+block.c_attn_proj_w = get_tensor_ex(ctx, blocknamestart + "attn_output.weight" );
+block.c_attn_proj_b = get_tensor_ex(ctx, blocknamestart + "attn_output.bias" );
-layer.ln_2_g = get_tensor_ex(ctx, blocknamestart + "ffn_norm.weight" );
-layer.ln_2_b = get_tensor_ex(ctx, blocknamestart + "ffn_norm.bias");
+block.ln_2_g = get_tensor_ex(ctx, blocknamestart + "ffn_norm.weight" );
+block.ln_2_b = get_tensor_ex(ctx, blocknamestart + "ffn_norm.bias");
-layer.c_mlp_fc_w = get_tensor_ex(ctx, blocknamestart + "ffn_up.weight" );
-layer.c_mlp_fc_b = get_tensor_ex(ctx, blocknamestart + "ffn_up.bias" );
+block.c_mlp_fc_w = get_tensor_ex(ctx, blocknamestart + "ffn_up.weight" );
+block.c_mlp_fc_b = get_tensor_ex(ctx, blocknamestart + "ffn_up.bias" );
-layer.c_mlp_proj_w = get_tensor_ex(ctx, blocknamestart + "ffn_down.weight" );
-layer.c_mlp_proj_b = get_tensor_ex(ctx, blocknamestart + "ffn_down.bias" );
+block.c_mlp_proj_w = get_tensor_ex(ctx, blocknamestart + "ffn_down.weight" );
+block.c_mlp_proj_b = get_tensor_ex(ctx, blocknamestart + "ffn_down.bias" );
// map by name
model.tensors[blocknamestart + "attn_norm.weight"] = layer.ln_1_g;
model.tensors[blocknamestart + "attn_norm.bias"] = layer.ln_1_b;
model.tensors[blocknamestart + "attn_norm.weight"] = block.ln_1_g;
model.tensors[blocknamestart + "attn_norm.bias"] = block.ln_1_b;
model.tensors[blocknamestart + "attn_qkv.weight"] = layer.c_attn_attn_w;
model.tensors[blocknamestart + "attn_qkv.bias"] = layer.c_attn_attn_b;
model.tensors[blocknamestart + "attn_qkv.weight"] = block.c_attn_attn_w;
model.tensors[blocknamestart + "attn_qkv.bias"] = block.c_attn_attn_b;
model.tensors[blocknamestart + "attn_output.weight"] = layer.c_attn_proj_w;
model.tensors[blocknamestart + "attn_output.bias"] = layer.c_attn_proj_b;
model.tensors[blocknamestart + "attn_output.weight"] = block.c_attn_proj_w;
model.tensors[blocknamestart + "attn_output.bias"] = block.c_attn_proj_b;
model.tensors[blocknamestart + "ffn_norm.weight"] = layer.ln_2_g;
model.tensors[blocknamestart + "ffn_norm.bias"] = layer.ln_2_b;
model.tensors[blocknamestart + "ffn_norm.weight"] = block.ln_2_g;
model.tensors[blocknamestart + "ffn_norm.bias"] = block.ln_2_b;
model.tensors[blocknamestart + "ffn_up.weight"] = layer.c_mlp_fc_w;
model.tensors[blocknamestart + "ffn_up.bias"] = layer.c_mlp_fc_b;
model.tensors[blocknamestart + "ffn_up.weight"] = block.c_mlp_fc_w;
model.tensors[blocknamestart + "ffn_up.bias"] = block.c_mlp_fc_b;
model.tensors[blocknamestart + "ffn_down.weight"] = layer.c_mlp_proj_w;
model.tensors[blocknamestart + "ffn_down.bias"] = layer.c_mlp_proj_b;
model.tensors[blocknamestart + "ffn_down.weight"] = block.c_mlp_proj_w;
model.tensors[blocknamestart + "ffn_down.bias"] = block.c_mlp_proj_b;
}
}
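
The loop above registers each weight twice: as a field of its gpt_neox_block and under its tensor name in model.tensors. A hypothetical lookup illustrating that both handles alias the same tensor (the name follows the blocknamestart pattern above, with i = 0):

    ggml_tensor * by_name  = model.tensors["transformer.blocks.0.attn_qkv.weight"];
    ggml_tensor * by_field = model.blocks[0].c_attn_attn_w;
    // by_name and by_field point to the same ggml tensor
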
@@ -610,10 +610,10 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd;
-const int n_layer = hparams.n_layer;
+const int n_block = hparams.n_block;
const int n_ctx = hparams.n_ctx;
-const int64_t n_mem = n_layer*n_ctx;
+const int64_t n_mem = n_block*n_ctx;
const int64_t n_elements = n_embd*n_mem;
// create the ggml context
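
The KV cache is sized from the renamed count: one key/value slot per block per context position. A self-contained worked example with hypothetical Pythia-like dimensions (none of these values are from the diff):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int n_embd  = 2048; // assumed embedding width
        const int n_block = 24;   // assumed number of transformer blocks
        const int n_ctx   = 2048; // assumed context length

        const int64_t n_mem      = (int64_t) n_block * n_ctx; // 49152 K/V slots
        const int64_t n_elements = (int64_t) n_embd  * n_mem; // 100663296 elements in memory_k

        // at 2 bytes per f16 element, memory_k alone would be 192 MiB
        std::printf("n_mem = %lld, n_elements = %lld\n",
                    (long long) n_mem, (long long) n_elements);
        return 0;
    }
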
@@ -647,37 +647,23 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
// feed-forward network
ggml_tensor * gpt_neox_ff(
-const gpt_neox_layer &layer,
+const gpt_neox_block &block,
ggml_context * ctx0,
ggml_tensor * inp) {
ggml_tensor * cur = ggml_norm(ctx0, inp);
-cur = ggml_add(ctx0,
-ggml_mul(ctx0,
-ggml_repeat(ctx0, layer.ln_2_g, cur),
-cur),
-ggml_repeat(ctx0, layer.ln_2_b, cur));
-cur = ggml_mul_mat(ctx0,
-layer.c_mlp_fc_w,
-cur);
-cur = ggml_add(ctx0,
-ggml_repeat(ctx0, layer.c_mlp_fc_b, cur),
-cur);
+cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, block.ln_2_g, cur), cur), ggml_repeat(ctx0, block.ln_2_b, cur));
+cur = ggml_mul_mat(ctx0, block.c_mlp_fc_w, cur);
+cur = ggml_add(ctx0, ggml_repeat(ctx0, block.c_mlp_fc_b, cur), cur);
// GELU activation
cur = ggml_gelu(ctx0, cur);
// projection
// cur = proj_w*cur + proj_b
-cur = ggml_mul_mat(ctx0,
-layer.c_mlp_proj_w,
-cur);
+cur = ggml_mul_mat(ctx0, block.c_mlp_proj_w, cur);
-cur = ggml_add(ctx0,
-ggml_repeat(ctx0, layer.c_mlp_proj_b, cur),
-cur);
+cur = ggml_add(ctx0, ggml_repeat(ctx0, block.c_mlp_proj_b, cur), cur);
return cur;
}
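
Collapsed to a single expression, the helper computes the standard pre-norm GPT-NeoX MLP; in terms of the block's fields (with · for matrix multiplication and ⊙ for elementwise scaling):

    ffn(x) = c_mlp_proj_w · gelu(c_mlp_fc_w · (norm(x) ⊙ ln_2_g + ln_2_b) + c_mlp_fc_b) + c_mlp_proj_b
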
@@ -701,7 +687,7 @@ bool gpt_neox_eval(
const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd;
-const int n_layer = hparams.n_layer;
+const int n_block = hparams.n_block;
const int n_ctx = hparams.n_ctx;
const int n_head = hparams.n_head;
const int n_vocab = hparams.n_vocab;
@@ -747,7 +733,7 @@ bool gpt_neox_eval(
// wte
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);
-for (int il = 0; il < n_layer; ++il) {
+for (int il = 0; il < n_block; ++il) {
struct ggml_tensor * cur;
ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
@@ -758,22 +744,15 @@ bool gpt_neox_eval(
cur = ggml_norm(ctx0, inpL);
cur = ggml_add(ctx0,
-ggml_mul(ctx0,
-ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
-cur),
-ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
+ggml_mul(ctx0, ggml_repeat(ctx0, model.blocks[il].ln_1_g, cur), cur),
+ggml_repeat(ctx0, model.blocks[il].ln_1_b, cur));
}
// compute QKV
{
-cur = ggml_mul_mat(ctx0,
-model.layers[il].c_attn_attn_w,
-cur);
-cur = ggml_add(ctx0,
-ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
-cur);
+cur = ggml_mul_mat(ctx0, model.blocks[il].c_attn_attn_w, cur);
+cur = ggml_add(ctx0, ggml_repeat(ctx0, model.blocks[il].c_attn_attn_b, cur), cur);
}
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head));
@@ -798,10 +777,7 @@ bool gpt_neox_eval(
}
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
-struct ggml_tensor * Q =
-ggml_permute(ctx0,
-Qcur,
-0, 2, 1, 3);
+struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
struct ggml_tensor * K =
@@ -842,17 +818,12 @@ bool gpt_neox_eval(
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
// cur = KQV_merged.contiguous().view(n_embd, N)
-cur = ggml_cpy(ctx0,
-KQV_merged,
-ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
// projection
{
-cur = ggml_mul_mat(ctx0,
-model.layers[il].c_attn_proj_w,
-cur);
-cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), cur);
+cur = ggml_mul_mat(ctx0, model.blocks[il].c_attn_proj_w, cur);
+cur = ggml_add(ctx0, ggml_repeat(ctx0, model.blocks[il].c_attn_proj_b, cur), cur);
}
}
@@ -861,7 +832,7 @@ bool gpt_neox_eval(
if (hparams.par_res == 0) {
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
-cur = gpt_neox_ff(model.layers[il], ctx0, inpFF);
+cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF);
// input for next layer
inpL = ggml_add(ctx0, cur, inpFF);
@@ -870,7 +841,7 @@ bool gpt_neox_eval(
// this is independent of the self-attention result, so it could be done in parallel to the self-attention
// note here we pass inpL instead of cur
-cur = gpt_neox_ff(model.layers[il], ctx0, inpL);
+cur = gpt_neox_ff(model.blocks[il], ctx0, inpL);
// layer input + FF
cur = ggml_add(ctx0, cur, inpFF);
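
For reference, the two residual layouts selected by hparams.par_res in this final hunk, written as one-line dataflow sketches (attn is the self-attention sub-block above, ff is gpt_neox_ff):

    // par_res == 0 (sequential): h = x + attn(x);  out = h + ff(h)
    // par_res == 1 (parallel):   out = x + attn(x) + ff(x)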