Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2025-01-13 20:14:29 +00:00)
llama : simplify falcon Q, K, V computation
parent 31a12f3d03
commit a104abea48

1 changed file: llama.cpp (35 lines changed)
@@ -3886,40 +3886,17 @@ static struct ggml_cgraph * llm_build_falcon(
             cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
             cb(cur, "wqkv", il);
 
-            const size_t wsize = ggml_type_size(cur->type);
-
-            // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
-            //       non-contiguous views is added for the rope operator
-            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(
-                ctx0, cur, n_embd_head, n_head, n_tokens,
-                wsize * n_embd_head,
-                wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                0));
-            cb(Qcur, "Qcur", il);
-
-            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(
-                ctx0, cur, n_embd_head, n_head_kv, n_tokens,
-                wsize * n_embd_head,
-                wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                wsize * n_embd_head * n_head));
-            cb(Kcur, "Kcur", il);
-
-            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(
-                ctx0, cur, n_embd_head, n_head_kv, n_tokens,
-                wsize * n_embd_head,
-                wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                wsize * n_embd_head * (n_head + n_head_kv)));
-            cb(Vcur, "Vcur", il);
+            // Note that the strides for Kcur, Vcur are set up so that the
+            // resulting views are misaligned with the tensor's storage
+            // (by applying the K/V offset we shift the tensor's original
+            // view to stick out behind the viewed QKV tensor's allocated
+            // memory, so to say). This is ok because no actual accesses
+            // happen to that out-of-range memory, but it can require some
+            // trickery when trying to accurately dump these views for
+            // debugging.
+            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
 
             // using mode = 2 for neox mode
             Qcur = ggml_rope_custom(ctx0, Qcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale);
             cb(Qcur, "Qcur", il);
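For context, the new ggml_view_2d offsets address exactly the same bytes of the fused wqkv output as the removed per-head ggml_view_3d strides: each row of cur is laid out as the Q block (n_embd floats) followed by the K and V blocks (n_embd_gqa floats each), and n_embd = n_embd_head * n_head, n_embd_gqa = n_embd_head * n_head_kv. The standalone sketch below (not part of the commit; toy head counts, plain C, no ggml dependency) only checks that the two ways of writing the offsets and row stride coincide.

/*
 * Sketch with made-up sizes: verify that the old per-head offset/stride
 * expressions and the new whole-block expressions select the same bytes
 * of a fused [Q | K | V] row.
 */
#include <assert.h>
#include <stdio.h>

int main(void) {
    // hypothetical head counts, not Falcon's real hyperparameters
    const int n_embd_head = 64;
    const int n_head      = 8;
    const int n_head_kv   = 2;

    const int n_embd     = n_embd_head * n_head;     // width of the Q block
    const int n_embd_gqa = n_embd_head * n_head_kv;  // width of the K (and V) block

    const size_t wsize = sizeof(float); // ggml_type_size(cur->type) for F32

    // removed code: offsets expressed per head
    const size_t k_offs_old = wsize * n_embd_head * n_head;
    const size_t v_offs_old = wsize * n_embd_head * (n_head + n_head_kv);

    // added code: offsets expressed as whole Q/K blocks
    const size_t k_offs_new = 1 * sizeof(float) * (n_embd);
    const size_t v_offs_new = 1 * sizeof(float) * (n_embd + n_embd_gqa);

    // both formulations address the same bytes of the fused QKV row
    assert(k_offs_old == k_offs_new);
    assert(v_offs_old == v_offs_new);

    // the old per-row stride equals the full fused row, i.e. cur->nb[1]
    const size_t row_old = wsize * n_embd_head * (n_head + 2 * n_head_kv);
    const size_t row_new = wsize * (n_embd + 2 * n_embd_gqa);
    assert(row_old == row_new);

    printf("K offset: %zu bytes, V offset: %zu bytes, row stride: %zu bytes\n",
           k_offs_new, v_offs_new, row_new);
    return 0;
}

Since the offsets and strides are identical, the 2-D views (made contiguous by ggml_cont) followed by ggml_reshape_3d should yield the same Q and K tensors that the hand-written 3-D views produced; the change is a simplification in how the slices are written, not in what they contain.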