llama : de-shadow (cont) [no ci]

commit 32e7b9dc99
parent 0127774ae4
Author: Georgi Gerganov
Date:   2025-01-12 12:30:54 +02:00
2 changed files with 34 additions and 30 deletions
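All of the renames below follow one pattern: a parameter or loop-local that reuses the name of a struct member (or of an outer local) is given a distinct name, so each use is unambiguous and the code stays quiet under GCC's -Wshadow. A minimal sketch of the pattern, using a hypothetical struct rather than code from this repository:

// Illustrative only; 'counter' is a made-up type, not part of llama.cpp.
// Build with: g++ -Wshadow -o demo demo.cpp
#include <cstdint>

struct counter {
    int32_t value = 0;

    // Before: 'void set(int32_t value)' would shadow the member 'value'.
    // After (the pattern used in this commit): rename the parameter and
    // drop the 'this->' qualifier that the shadowing forced.
    void set(int32_t val) {
        value = val; // member on the left, parameter on the right
    }
};

int main() {
    counter c;
    c.set(42);
    return c.value == 42 ? 0 : 1;
}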

src/llama-vocab.cpp

@@ -24,25 +24,30 @@
 struct naive_trie {
     naive_trie() : has_value(false), value(0) {
     }
-    void insert(const char * key, size_t len, int32_t value = 0) {
+    void insert(const char * key, size_t len, int32_t val = 0) {
         if (len == 0) {
-            this->has_value = true;
-            this->value = value;
+            has_value = true;
+            value = val;
             return;
         }
         char c = key[0];
         auto res = children.find(c);
         if (res != children.end()) {
-            res->second.insert(key + 1, len - 1, value);
+            res->second.insert(key + 1, len - 1, val);
         } else {
             auto res = children.insert(std::make_pair(c, naive_trie()));
-            res.first->second.insert(key + 1, len - 1, value);
+            res.first->second.insert(key + 1, len - 1, val);
         }
     }
     std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) const {
         if (len == 0 || offset == len) {
             return std::make_pair(key, offset);
         }
         char c = key[offset];
         auto res = children.find(c);
         if (res != children.end()) {
@@ -51,6 +56,7 @@ struct naive_trie {
         return std::make_pair(key, offset);
     }
     const struct naive_trie * traverse(const char c) const {
         auto res = children.find(c);
         if (res != children.end()) {
@@ -59,6 +65,7 @@ struct naive_trie {
         return NULL;
     }
     std::map<char, struct naive_trie> children;
     bool has_value;
     llama_token value;
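For context, naive_trie (above) is a plain character trie whose get_longest_prefix() walks the tree as far as the input allows. A self-contained toy trie showing the same insert / longest-prefix idea; this is an illustration only, not the llama.cpp implementation (the real struct stores llama_token values and lives in the file above):

// Simplified, standalone sketch of longest-prefix lookup in a character trie.
#include <cstdio>
#include <cstring>
#include <map>

struct toy_trie {
    bool has_value = false;
    std::map<char, toy_trie> children;

    void insert(const char * key, size_t len) {
        if (len == 0) { has_value = true; return; }
        children[key[0]].insert(key + 1, len - 1);
    }

    // Returns the length of the longest inserted key that prefixes 'key'.
    size_t longest_prefix(const char * key, size_t len) const {
        size_t best = 0, depth = 0;
        const toy_trie * node = this;
        while (depth < len) {
            auto it = node->children.find(key[depth]);
            if (it == node->children.end()) break;
            node = &it->second;
            ++depth;
            if (node->has_value) best = depth;
        }
        return best;
    }
};

int main() {
    toy_trie t;
    t.insert("ab", 2);
    t.insert("abcd", 4);
    const char * s = "abcde";
    printf("longest prefix of \"%s\": %zu chars\n", s, t.longest_prefix(s, strlen(s))); // prints 4
    return 0;
}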

src/llama.cpp

@@ -1656,10 +1656,10 @@ struct llm_build_context {
         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
-            const int64_t n_head_kv = hparams.n_head_kv(il);
-            const int64_t n_head = hparams.n_head(il);
+            const int64_t n_head_kv_i = hparams.n_head_kv(il);
+            const int64_t n_head_i = hparams.n_head(il);
-            if (n_head == 0) {
+            if (n_head_i == 0) {
                 // attention-free layer of Llama-3_1-Nemotron-51B
                 cur = inpL;
             } else {
@@ -1670,11 +1670,11 @@ struct llm_build_context {
                 cb(cur, "attn_norm", il);
             }
-            if (n_head > 0 && n_head_kv == 0) {
+            if (n_head_i > 0 && n_head_kv_i == 0) {
                 // "linear attention" of Llama-3_1-Nemotron-51B
                 cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
                 cb(cur, "wo", il);
-            } else if (n_head > 0) {
+            } else if (n_head_i > 0) {
                 // self-attention
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
                 struct ggml_tensor * rope_factors = build_rope_factors(il);
@@ -1702,14 +1702,14 @@ struct llm_build_context {
                 }
                 Qcur = ggml_rope_ext(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head_i, n_tokens), inp_pos, rope_factors,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
                 Kcur = ggml_rope_ext(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv_i, n_tokens), inp_pos, rope_factors,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -1734,7 +1734,7 @@ struct llm_build_context {
             // modified to support attention-free layer of Llama-3_1-Nemotron-51B
             struct ggml_tensor * ffn_inp = cur;
-            if (n_head > 0) {
+            if (n_head_i > 0) {
                 ffn_inp = ggml_add(ctx0, cur, inpSA);
                 cb(ffn_inp, "ffn_inp", il);
             }
@@ -2643,7 +2643,7 @@ struct llm_build_context {
         // iterate layers
         for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * cur = inpL;
+            cur = inpL;
             struct ggml_tensor * Qcur;
             struct ggml_tensor * Kcur;
@@ -4717,8 +4717,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_gemma() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
-        const int64_t n_embd_head_k = hparams.n_embd_head_k;
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4825,8 +4823,6 @@ struct llm_build_context {
     struct ggml_cgraph * build_gemma2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
-        const int64_t n_embd_head_k = hparams.n_embd_head_k;
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4962,6 +4958,7 @@ struct llm_build_context {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
         const int64_t n_embd_head = hparams.n_embd_head_v;
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
@@ -5800,9 +5797,9 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
         for (int il = 0; il < n_layer; ++il) {
-            const int64_t n_head = hparams.n_head(il);
-            const int64_t n_head_kv = hparams.n_head_kv(il);
-            const int64_t n_head_qkv = 2*n_head_kv + n_head;
+            const int64_t n_head_i = hparams.n_head(il);
+            const int64_t n_head_kv_i = hparams.n_head_kv(il);
+            const int64_t n_head_qkv_i = 2*n_head_kv_i + n_head_i;
             cur = inpL;
             struct ggml_tensor * residual = cur;
@@ -5818,15 +5815,15 @@ struct llm_build_context {
             cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
             cb(cur, "wqkv", il);
-            cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
+            cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv_i, n_tokens);
-            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0));
+            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_i, n_tokens, cur->nb[1], cur->nb[2], 0));
             cb(Qcur, "Qcur", il);
-            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head));
+            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv_i, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head_i));
             cb(Kcur, "Kcur", il);
-            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
+            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv_i, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head_i+n_head_kv_i)));
             cb(Vcur, "Vcur", il);
             Qcur = llm_build_norm(ctx0, Qcur, hparams,
@@ -5851,7 +5848,7 @@ struct llm_build_context {
             );
             cb(Kcur, "Kcur", il);
-            Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens);
+            Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv_i, n_tokens);
             cb(Qcur, "Vcur", il);
             cur = llm_build_kv(ctx0, lctx, kv_self, gf,
@@ -7495,9 +7492,9 @@ struct llm_build_context {
         // Token shift state dimensions should be 2 * n_emb
         GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2);
-        const int64_t n_seqs = ubatch.n_seqs;
+        const int64_t n_seqs = ubatch.n_seqs;
         const int64_t n_seq_tokens = ubatch.n_seq_tokens;
         const int64_t n_tokens = ubatch.n_tokens;
         GGML_ASSERT(n_seqs != 0);
         GGML_ASSERT(ubatch.equal_seqs);
         GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
@@ -7608,9 +7605,9 @@ struct llm_build_context {
         GGML_ASSERT(n_embd == hparams.n_embd_k_s());
-        const int64_t n_seqs = ubatch.n_seqs;
+        const int64_t n_seqs = ubatch.n_seqs;
         const int64_t n_seq_tokens = ubatch.n_seq_tokens;
         const int64_t n_tokens = ubatch.n_tokens;
         GGML_ASSERT(n_seqs != 0);
         GGML_ASSERT(ubatch.equal_seqs);
         GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
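The build_llama hunks above branch on per-layer head counts, which is what lets Llama-3_1-Nemotron-51B mix layer types in one graph: n_head_i == 0 means an attention-free layer, n_head_i > 0 with n_head_kv_i == 0 takes the "linear attention" path, and anything else is ordinary self-attention. A schematic sketch of that dispatch, with made-up types and names rather than the llama.cpp API:

// Schematic only: hypothetical types; the real logic is in build_llama() above.
#include <cstdint>
#include <cstdio>
#include <vector>

struct layer_hparams {
    int64_t n_head;    // 0 -> attention-free layer
    int64_t n_head_kv; // 0 (with n_head > 0) -> "linear attention" layer
};

const char * layer_kind(const layer_hparams & hp) {
    if (hp.n_head == 0)    return "attention-free (pass input through)";
    if (hp.n_head_kv == 0) return "linear attention (single projection, no KV)";
    return "self-attention (Q/K/V + RoPE + KV cache)";
}

int main() {
    // Per-layer head counts, in the spirit of hparams.n_head(il) / n_head_kv(il).
    std::vector<layer_hparams> layers = {
        {32, 8}, {32, 0}, {0, 0}, {32, 8},
    };
    for (size_t il = 0; il < layers.size(); ++il) {
        printf("layer %zu: %s\n", il, layer_kind(layers[il]));
    }
    return 0;
}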