mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-14 04:24:30 +00:00
llama : make starcoder graph build more consistent with others
This commit is contained in:
parent
f82328ab65
commit
92a4f86879
98
llama.cpp
98
llama.cpp
@ -3446,7 +3446,9 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|||||||
const int64_t n_layer = hparams.n_layer;
|
const int64_t n_layer = hparams.n_layer;
|
||||||
const int64_t n_ctx = hparams.n_ctx;
|
const int64_t n_ctx = hparams.n_ctx;
|
||||||
const int64_t n_head = hparams.n_head;
|
const int64_t n_head = hparams.n_head;
|
||||||
|
const int64_t n_head_kv = hparams.n_head_kv;
|
||||||
const int64_t n_embd_head = hparams.n_embd_head();
|
const int64_t n_embd_head = hparams.n_embd_head();
|
||||||
|
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
||||||
|
|
||||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||||
|
|
||||||
@ -3508,28 +3510,44 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|||||||
position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
|
position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
||||||
|
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
||||||
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||||
|
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
||||||
|
}
|
||||||
|
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
||||||
|
|
||||||
inpL = ggml_add(ctx0, token, position);
|
inpL = ggml_add(ctx0, token, position);
|
||||||
|
ggml_set_name(inpL, "inpL");
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
{
|
{
|
||||||
// Norm
|
// Norm
|
||||||
cur = ggml_norm(ctx0, inpL, norm_eps);
|
cur = ggml_norm(ctx0, inpL, norm_eps);
|
||||||
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
|
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
// Self Attention
|
// Self Attention
|
||||||
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
|
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
|
||||||
|
|
||||||
struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
|
struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
|
||||||
struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
|
struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
|
||||||
struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
|
struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
|
||||||
|
|
||||||
// store key and value to memory
|
struct ggml_tensor * Qcur = tmpq;
|
||||||
if (N >= 1) {
|
struct ggml_tensor * Kcur = tmpk;
|
||||||
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
|
|
||||||
struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past));
|
{
|
||||||
|
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
|
||||||
|
ggml_set_name(Vcur, "Vcur");
|
||||||
|
|
||||||
|
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
|
||||||
|
ggml_set_name(k, "k");
|
||||||
|
|
||||||
|
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
|
||||||
|
( n_ctx)*ggml_element_size(kv_self.v),
|
||||||
|
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
||||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
||||||
@ -3541,56 +3559,62 @@ static struct ggml_cgraph * llm_build_starcoder(
|
|||||||
Qcur,
|
Qcur,
|
||||||
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
|
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
|
||||||
0, 2, 1, 3);
|
0, 2, 1, 3);
|
||||||
|
ggml_set_name(Q, "Q");
|
||||||
|
|
||||||
struct ggml_tensor * K =
|
struct ggml_tensor * K =
|
||||||
ggml_permute(ctx0,
|
ggml_view_3d(ctx0, kv_self.k,
|
||||||
ggml_reshape_3d(ctx0,
|
n_embd_head, n_past + N, n_head_kv,
|
||||||
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
|
ggml_element_size(kv_self.k)*n_embd_gqa,
|
||||||
n_embd/n_head, n_head, n_past + N),
|
ggml_element_size(kv_self.k)*n_embd_head,
|
||||||
0, 2, 1, 3); //TODO: need to be tiled
|
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
||||||
|
ggml_set_name(K, "K");
|
||||||
|
|
||||||
// K * Q
|
// K * Q
|
||||||
// [n_past + N, N, 12]
|
|
||||||
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||||
|
ggml_set_name(KQ, "KQ");
|
||||||
|
|
||||||
// KQ_scaled = KQ / sqrt(n_embd/n_head)
|
// KQ_scaled = KQ / sqrt(n_embd_head)
|
||||||
// [n_past + N, N, 12]
|
// KQ_scaled shape [n_past + N, N, n_head, 1]
|
||||||
struct ggml_tensor * KQ_scaled =
|
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
||||||
ggml_scale_inplace(ctx0,
|
ggml_set_name(KQ_scaled, "KQ_scaled");
|
||||||
KQ,
|
|
||||||
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
|
|
||||||
);
|
|
||||||
|
|
||||||
// KQ_masked = mask_past(KQ_scaled)
|
// KQ_masked = mask_past(KQ_scaled)
|
||||||
// [n_past + N, N, 12]
|
|
||||||
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
|
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
|
||||||
|
ggml_set_name(KQ_masked, "KQ_masked");
|
||||||
|
|
||||||
// KQ = soft_max(KQ_masked)
|
// KQ = soft_max(KQ_masked)
|
||||||
// [n_past + N, N, 12]
|
|
||||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
||||||
|
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
||||||
|
|
||||||
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
|
// split cached V into n_head heads
|
||||||
// [n_past + N, 64, 12]
|
struct ggml_tensor * V =
|
||||||
struct ggml_tensor * V_trans =
|
ggml_view_3d(ctx0, kv_self.v,
|
||||||
ggml_cpy(ctx0,
|
n_past + N, n_embd_head, n_head_kv,
|
||||||
ggml_permute(ctx0,
|
ggml_element_size(kv_self.v)*n_ctx,
|
||||||
ggml_reshape_3d(ctx0,
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
||||||
ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
|
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
||||||
n_embd/n_head, n_head, n_past + N),
|
ggml_set_name(V, "V");
|
||||||
1, 2, 0, 3),
|
|
||||||
ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
|
|
||||||
|
|
||||||
// KQV = transpose(V) * KQ_soft_max
|
#if 1
|
||||||
// [64, N, 12]
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
||||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
|
ggml_set_name(KQV, "KQV");
|
||||||
|
#else
|
||||||
|
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
|
||||||
|
// on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
|
||||||
|
// is there a better way?
|
||||||
|
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
|
||||||
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
|
||||||
|
#endif
|
||||||
|
|
||||||
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
||||||
// [64, 12, N]
|
|
||||||
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||||
|
ggml_set_name(KQV_merged, "KQV_merged");
|
||||||
|
|
||||||
|
// cur = KQV_merged.contiguous().view(n_embd, N)
|
||||||
cur = ggml_cpy(ctx0,
|
cur = ggml_cpy(ctx0,
|
||||||
KQV_merged,
|
KQV_merged,
|
||||||
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
||||||
|
ggml_set_name(cur, "KQV_merged_contiguous");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Projection
|
// Projection
|
||||||
|
Loading…
Reference in New Issue
Block a user