diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index d0fb85cea..cd943b97c 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -24,25 +24,30 @@ struct naive_trie { naive_trie() : has_value(false), value(0) { } - void insert(const char * key, size_t len, int32_t value = 0) { + + void insert(const char * key, size_t len, int32_t val = 0) { if (len == 0) { - this->has_value = true; - this->value = value; + has_value = true; + value = val; + return; } + char c = key[0]; auto res = children.find(c); if (res != children.end()) { - res->second.insert(key + 1, len - 1, value); + res->second.insert(key + 1, len - 1, val); } else { auto res = children.insert(std::make_pair(c, naive_trie())); - res.first->second.insert(key + 1, len - 1, value); + res.first->second.insert(key + 1, len - 1, val); } } + std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) const { if (len == 0 || offset == len) { return std::make_pair(key, offset); } + char c = key[offset]; auto res = children.find(c); if (res != children.end()) { @@ -51,6 +56,7 @@ struct naive_trie { return std::make_pair(key, offset); } + const struct naive_trie * traverse(const char c) const { auto res = children.find(c); if (res != children.end()) { @@ -59,6 +65,7 @@ struct naive_trie { return NULL; } + std::map<char, struct naive_trie> children; bool has_value; llama_token value; diff --git a/src/llama.cpp b/src/llama.cpp index ed99094be..d907c2d6e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1656,10 +1656,10 @@ struct llm_build_context { const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_head = hparams.n_head(il); + const int64_t n_head_kv_i = hparams.n_head_kv(il); + const int64_t n_head_i = hparams.n_head(il); - if (n_head == 0) { + if (n_head_i == 0) { // attention-free layer of Llama-3_1-Nemotron-51B cur = inpL; } else { @@ -1670,11 +1670,11 @@ struct llm_build_context { cb(cur, "attn_norm", il); } - if (n_head > 0 && n_head_kv == 0) { + if (n_head_i > 0 && n_head_kv_i == 0) { // "linear attention" of Llama-3_1-Nemotron-51B cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); cb(cur, "wo", il); - } else if (n_head > 0) { + } else if (n_head_i > 0) { // self-attention // rope freq factors for llama3; may return nullptr for llama2 and other models struct ggml_tensor * rope_factors = build_rope_factors(il); @@ -1702,14 +1702,14 @@ struct llm_build_context { } Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head_i, n_tokens), inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv_i, n_tokens), inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -1734,7 +1734,7 @@ struct llm_build_context { // modified to support attention-free layer of Llama-3_1-Nemotron-51B struct ggml_tensor * ffn_inp = cur; - if (n_head > 0) { + if (n_head_i > 0) { ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); } @@ -2643,7 +2643,7 @@ struct llm_build_context { // iterate layers for (int il = 0; il < 
n_layer; ++il) { - struct ggml_tensor * cur = inpL; + cur = inpL; struct ggml_tensor * Qcur; struct ggml_tensor * Kcur; @@ -4717,8 +4717,6 @@ struct llm_build_context { struct ggml_cgraph * build_gemma() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - const int64_t n_embd_head_k = hparams.n_embd_head_k; - struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -4825,8 +4823,6 @@ struct llm_build_context { struct ggml_cgraph * build_gemma2() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - const int64_t n_embd_head_k = hparams.n_embd_head_k; - struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -4962,6 +4958,7 @@ struct llm_build_context { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5800,9 +5797,9 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < n_layer; ++il) { - const int64_t n_head = hparams.n_head(il); - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_head_qkv = 2*n_head_kv + n_head; + const int64_t n_head_i = hparams.n_head(il); + const int64_t n_head_kv_i = hparams.n_head_kv(il); + const int64_t n_head_qkv_i = 2*n_head_kv_i + n_head_i; cur = inpL; struct ggml_tensor * residual = cur; @@ -5818,15 +5815,15 @@ struct llm_build_context { cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); - cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens); + cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv_i, n_tokens); - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0)); + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_i, n_tokens, cur->nb[1], cur->nb[2], 0)); 
cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head)); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv_i, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head_i)); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv_i, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head_i+n_head_kv_i))); cb(Vcur, "Vcur", il); Qcur = llm_build_norm(ctx0, Qcur, hparams, @@ -5851,7 +5848,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens); + Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv_i, n_tokens); cb(Qcur, "Vcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, @@ -7495,9 +7492,9 @@ struct llm_build_context { // Token shift state dimensions should be 2 * n_emb GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2); - const int64_t n_seqs = ubatch.n_seqs; + const int64_t n_seqs = ubatch.n_seqs; const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_tokens = ubatch.n_tokens; + GGML_ASSERT(n_seqs != 0); GGML_ASSERT(ubatch.equal_seqs); GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); @@ -7608,9 +7605,9 @@ struct llm_build_context { GGML_ASSERT(n_embd == hparams.n_embd_k_s()); - const int64_t n_seqs = ubatch.n_seqs; + const int64_t n_seqs = ubatch.n_seqs; const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_tokens = ubatch.n_tokens; + GGML_ASSERT(n_seqs != 0); GGML_ASSERT(ubatch.equal_seqs); GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);