mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-31 22:04:35 +00:00
llama : rethink recurrent state cell counts
* llama : begin work on support for variable GQA

  This will also be useful for Jamba if we consider the Mamba layers to have 0 KV heads.

* llama : gracefully fail when not finding hybrid slot
This commit is contained in:
parent
3b57b55c6f
commit
7e13f19fb5
366
llama.cpp
366
llama.cpp
@ -1753,6 +1753,9 @@ struct llama_hparams {
|
||||
uint32_t n_expert_used = 0;
|
||||
uint32_t n_vocab_type = 0; // for BERT-style token types
|
||||
|
||||
// TODO: find a more compact way to add more per-layer hyper-parameters
|
||||
std::vector<int32_t> n_head_kv_vec;
|
||||
|
||||
float f_norm_eps;
|
||||
float f_norm_rms_eps;
|
||||
|
||||
@ -1793,6 +1796,8 @@ struct llama_hparams {
|
||||
if (this->n_expert != other.n_expert) return true;
|
||||
if (this->n_expert_used != other.n_expert_used) return true;
|
||||
|
||||
if (this->n_head_kv_vec != other.n_head_kv_vec) return true;
|
||||
|
||||
if (this->rope_finetuned != other.rope_finetuned) return true;
|
||||
if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
|
||||
|
||||
@ -1812,29 +1817,46 @@ struct llama_hparams {
|
||||
return false;
|
||||
}
|
||||
|
||||
uint32_t n_gqa() const {
|
||||
uint32_t n_head_kv_l(uint32_t layer) const {
|
||||
if (layer < n_head_kv_vec.size()) {
|
||||
int32_t n_hkv_l = n_head_kv_vec[layer];
|
||||
// TODO: what should happen when it's negative?
|
||||
GGML_ASSERT(n_hkv_l >= 0);
|
||||
return n_hkv_l;
|
||||
}
|
||||
return n_head_kv;
|
||||
}
|
||||
|
||||
uint32_t n_gqa(uint32_t layer = 0) const {
|
||||
uint32_t n_head_kv = n_head_kv_l(layer);
|
||||
if (n_head_kv == 0) {
|
||||
return 0;
|
||||
}
|
||||
return n_head/n_head_kv;
|
||||
}
|
||||
|
||||
uint32_t n_embd_k_gqa() const { // dimension of key embeddings across all k-v heads
|
||||
uint32_t n_embd_k_gqa(uint32_t layer = 0) const { // dimension of key embeddings across all k-v heads
|
||||
uint32_t n_head_kv = n_head_kv_l(layer);
|
||||
return n_embd_head_k * n_head_kv;
|
||||
}
|
||||
|
||||
uint32_t n_embd_v_gqa() const { // dimension of value embeddings across all k-v heads
|
||||
uint32_t n_embd_v_gqa(uint32_t layer = 0) const { // dimension of value embeddings across all k-v heads
|
||||
uint32_t n_head_kv = n_head_kv_l(layer);
|
||||
return n_embd_head_v * n_head_kv;
|
||||
}
|
||||
|
||||
uint32_t n_embd_r() const { // dimension of the rolling state embeddings
|
||||
uint32_t n_embd_r(uint32_t layer) const { // dimension of the rolling state embeddings
|
||||
// TODO: support using an SSM in place of the MLP of a Transformer
|
||||
if (n_head_kv_l(layer) != 0) { return 0; }
|
||||
// corresponds to Mamba's conv_states size
|
||||
// TODO: maybe support other convolution strides than 1
|
||||
// NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
|
||||
return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
|
||||
}
|
||||
|
||||
uint32_t n_embd_s() const { // dimension of the recurrent state embeddings
|
||||
uint32_t n_embd_s(uint32_t layer) const { // dimension of the recurrent state embeddings
|
||||
// TODO: support using an SSM in place of the MLP of a Transformer
|
||||
if (n_head_kv_l(layer) != 0) { return 0; }
|
||||
// corresponds to Mamba's ssm_states size
|
||||
return ssm_d_state * ssm_d_inner;
|
||||
}
|
||||
@ -2078,10 +2100,12 @@ struct llama_rs_cache {
|
||||
// computed when finding a slot
|
||||
uint32_t n = 0; // range of states used for the last slot
|
||||
|
||||
// useful to know the minimum reserved cell count per seq_id
|
||||
// only counts sequences which have a non-shared tail
|
||||
// only counts cells which are tails of all of their sequences.
|
||||
// useful to know the minimum reserved cell count per seq_id.
|
||||
uint32_t n_seqs = 0;
|
||||
// cells part of multiple sequences AND which have at least one tail
|
||||
// cells part of multiple sequences,
|
||||
// but which are only the tail of some of them.
|
||||
// useful to dismiss sequences used as a shared prompt
|
||||
uint32_t n_shared_tail_cells = 0;
|
||||
|
||||
// with state models, a cell can hold the state for more than one past token
|
||||
@ -2279,10 +2303,8 @@ struct llama_rs_cache {
|
||||
for (uint32_t cell_id = 0; (uint32_t) cell_id < size; ++cell_id) {
|
||||
llama_rs_cell & rs_cell = cells[cell_id];
|
||||
if (!rs_cell.seq_nodes.empty()) {
|
||||
if (rs_cell.seq_nodes.size() == 1) {
|
||||
if (rs_cell.tail_rc == 1) {
|
||||
if (rs_cell.seq_nodes.size() == rs_cell.tail_rc) {
|
||||
n_seqs_verif += 1;
|
||||
}
|
||||
} else if (rs_cell.tail_rc > 0) {
|
||||
n_shared_tail_cells_verif += 1;
|
||||
}
|
||||
@ -2308,9 +2330,11 @@ struct llama_rs_cache {
|
||||
}
|
||||
|
||||
// returns an iterator to the seq_node after the removed one, or the same which was passed if it wasn't removed.
|
||||
// Why an iterator? Because it allows using std::vector<T>::erase.
|
||||
std::vector<llama_rs_seq_node>::iterator remove_seq_node_from_cell(llama_rs_cell & rs_cell, std::vector<llama_rs_seq_node>::iterator node_iter) {
|
||||
GGML_ASSERT(&rs_cell >= cells.data() && &rs_cell < cells.data() + cells.size());
|
||||
// TODO: assert the iterator points inside the correct vector
|
||||
// The iterator needs to point inside the correct vector
|
||||
GGML_ASSERT(node_iter.base() >= rs_cell.seq_nodes.data() && node_iter.base() < rs_cell.seq_nodes.data() + rs_cell.seq_nodes.size());
|
||||
if (node_iter != rs_cell.seq_nodes.end()) {
|
||||
// update the tree
|
||||
llama_rs_seq_node node = *node_iter;
|
||||
@ -2325,14 +2349,22 @@ struct llama_rs_cache {
|
||||
GGML_ASSERT(prev_node != prev_cell.seq_nodes.end());
|
||||
prev_node->next_cell = node.next_cell;
|
||||
if (node.is_tail()) {
|
||||
// move the tail back to the previous cell
|
||||
if (prev_cell.seq_nodes.size() > 1) {
|
||||
if (rs_cell.tail_rc == rs_cell.seq_nodes.size()) {
|
||||
if (prev_cell.tail_rc == 0) {
|
||||
n_shared_tail_cells += 1;
|
||||
}
|
||||
if (rs_cell.seq_nodes.size() == 1) {
|
||||
|
||||
// o oo oo
|
||||
// |/ -> o/
|
||||
// | |
|
||||
// e.g. when removing the leaf with a single tail
|
||||
if (rs_cell.tail_rc == 1 && prev_cell.tail_rc != prev_cell.seq_nodes.size()) {
|
||||
n_seqs -= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
prev_cell.tail_rc += 1;
|
||||
}
|
||||
}
|
||||
@ -2341,17 +2373,22 @@ struct llama_rs_cache {
|
||||
if (node.is_tail()) {
|
||||
seq.tail = rs_cell.prev;
|
||||
if (rs_cell.tail_rc == 1) {
|
||||
if (rs_cell.seq_nodes.size() > 1) {
|
||||
// assuming the previous cell of a shared cell is also shared,
|
||||
// this was a shared tail cell, but will no longer be a tail cell
|
||||
n_shared_tail_cells -= 1;
|
||||
} else if (seq.tail < 0) {
|
||||
if (seq.tail < 0) {
|
||||
// no more tail, no more sequence
|
||||
if (rs_cell.seq_nodes.size() > 1) {
|
||||
n_shared_tail_cells -= 1;
|
||||
} else {
|
||||
n_seqs -= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(rs_cell.tail_rc > 0);
|
||||
rs_cell.tail_rc -= 1;
|
||||
} else if (rs_cell.tail_rc == rs_cell.seq_nodes.size() - 1) {
|
||||
// will fully become a tail cell
|
||||
if (rs_cell.tail_rc > 0) {
|
||||
n_seqs += 1;
|
||||
}
|
||||
}
|
||||
if (node_iter == rs_cell.seq_nodes.begin()) {
|
||||
// this seq_id was the first in the list
|
||||
@ -2363,14 +2400,6 @@ struct llama_rs_cache {
|
||||
if ((uint32_t) next_node->seq_id < seq_tails.size()) {
|
||||
auto & next_seq = seq_tails[next_node->seq_id];
|
||||
next_seq.n_cells += 1;
|
||||
// only the tail ref count from the other seq_ids are left in tail_rc
|
||||
if (rs_cell.tail_rc > 0) {
|
||||
// will become a non-shared cell
|
||||
if (rs_cell.seq_nodes.size() == 2) {
|
||||
n_shared_tail_cells -= 1;
|
||||
n_seqs += 1;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(false && "invalid seq_id");
|
||||
}
|
||||
@ -2433,44 +2462,42 @@ struct llama_rs_cache {
|
||||
rs_cell.pos = prev_cell.pos + 1;
|
||||
rs_cell.src = prev_cell.src;
|
||||
}
|
||||
prev_cell.tail_rc -= 1;
|
||||
prev_node->next_cell = i_cell;
|
||||
rs_cell.prev = prev;
|
||||
if (seq.tail == prev) {
|
||||
// What to do when the tail moves...
|
||||
// from unique to shared (n_seqs--)
|
||||
// if the new cell has one seq_id or has no tails (n_shared_tail_cells++)
|
||||
// if the new cell has one seq_id and a tail (n_seqs-- (yes, another time))
|
||||
// from unique to unique (seq.n_cells++)
|
||||
// from empty to unique (seq.n_cells++, n_seqs++)
|
||||
// from empty to shared
|
||||
// if the new cell only has one seq_id or has no tail (n_shared_tail_cells++)
|
||||
// if the new cell only has one seq_id and has one tail (n_seqs--)
|
||||
// from shared to shared
|
||||
// if the last cell has no tails (n_shared_tail_cells--)
|
||||
// if the new cell has no tails or has one seq_id (n_shared_tail_cells++)
|
||||
// if the new cell only has one seq_id and has one tail (n_seqs--)
|
||||
// from shared to unique (seq.n_cells++)
|
||||
// if this seq_id was not the first of the last cell (n_seqs++)
|
||||
// if the last cell has no tails (n_shared_tail_cells--)
|
||||
if (prev_cell.seq_nodes.size() > 1) {
|
||||
// from shared
|
||||
if (rs_cell.is_empty()) {
|
||||
// to unique
|
||||
if (prev_cell.seq_nodes[0].seq_id != id) {
|
||||
n_seqs += 1;
|
||||
// (Legend: tail: O, one or more non-tails: o, one or more tails O+, empty: _)
|
||||
// O -> oO (n_seqs--, n_shared_tail_cells++)
|
||||
// O -> O (seq.n_cells++)
|
||||
// OO+ -> oO (n_seqs--, n_shared_tail_cells += 2)
|
||||
// OO+ -> O+ (n_shared_tail_cells++ (the previous cell becomes oO+))
|
||||
// _ -> oO (n_shared_tail_cells++)
|
||||
// _ -> O (seq.n_cells++, n_seqs++)
|
||||
// Oo -> O (seq.n_cells++, n_seqs++, n_shared_tail_cell--)
|
||||
// Oo -> OO+ (n_shared_tail_cell--)
|
||||
// OOo -> O (seq.n_cells++, n_seqs++)
|
||||
if (prev_cell.seq_nodes.size() == prev_cell.tail_rc) {
|
||||
// from fully tail
|
||||
if (prev_cell.tail_rc > 1) {
|
||||
// the previous tail becomes shared with a non-tail
|
||||
n_shared_tail_cells += 1;
|
||||
}
|
||||
}
|
||||
// the previous cell is no longer a shared tail
|
||||
if (prev_cell.tail_rc == 0) {
|
||||
n_shared_tail_cells -= 1;
|
||||
}
|
||||
} else if (!rs_cell.is_empty()) {
|
||||
// from unique to shared
|
||||
if (!rs_cell.is_empty() && rs_cell.tail_rc == 0) {
|
||||
// the new tail cell was previously a fully non-tail cell
|
||||
n_shared_tail_cells += 1;
|
||||
n_seqs -= 1;
|
||||
}
|
||||
} else if (rs_cell.is_empty()) {
|
||||
// from shared to unique
|
||||
n_seqs += 1;
|
||||
if (prev_cell.tail_rc == 1) {
|
||||
// it was the last tail of the previous cell
|
||||
n_shared_tail_cells -= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
prev_cell.tail_rc -= 1;
|
||||
}
|
||||
if (rs_cell.is_empty()) {
|
||||
// to unique
|
||||
seq.n_cells += 1;
|
||||
@ -2482,15 +2509,10 @@ struct llama_rs_cache {
|
||||
rs_cell.src = -1;
|
||||
}
|
||||
used += 1;
|
||||
} else {
|
||||
// to shared
|
||||
if (rs_cell.seq_nodes.size() == 1) {
|
||||
// a lone tail becomes a shared cell
|
||||
if (rs_cell.tail_rc > 0) {
|
||||
n_seqs -= 1;
|
||||
}
|
||||
n_shared_tail_cells += 1;
|
||||
} else if (rs_cell.tail_rc == 0) {
|
||||
// to shared
|
||||
if (seq.tail < 0) {
|
||||
// from empty to shared
|
||||
n_shared_tail_cells += 1;
|
||||
}
|
||||
}
|
||||
@ -2910,26 +2932,18 @@ static bool llama_cache_init(
|
||||
const llama_context * ctx,
|
||||
ggml_type type_k,
|
||||
ggml_type type_v,
|
||||
uint32_t n_ctx,
|
||||
uint32_t n_seq_max,
|
||||
bool offload) {
|
||||
const llama_model & model = ctx->model;
|
||||
const llama_cparams & cparams = ctx->cparams;
|
||||
|
||||
const struct llama_hparams & hparams = model.hparams;
|
||||
|
||||
// TODO: per layer n_embd_*
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
const uint32_t n_embd_r = hparams.n_embd_r();
|
||||
const uint32_t n_embd_s = hparams.n_embd_s();
|
||||
const bool has_kv = hparams.n_head != 0 && hparams.causal_attn;
|
||||
const bool has_r = n_embd_r != 0;
|
||||
const bool has_s = n_embd_s != 0;
|
||||
const bool has_kv = hparams.n_head_kv != 0 && hparams.causal_attn;
|
||||
const bool has_r = hparams.ssm_d_conv != 0 && hparams.ssm_d_inner != 0;
|
||||
const bool has_s = hparams.ssm_d_state != 0 && hparams.ssm_d_inner != 0;
|
||||
const bool has_rs = has_r || has_s;
|
||||
const uint32_t kv_size = has_kv ? n_ctx : 0;
|
||||
const uint32_t rs_size = has_rs ? n_seq_max : 0;
|
||||
// TODO: per cache type layer count
|
||||
const uint32_t kv_size = has_kv ? cparams.n_ctx : 0;
|
||||
const uint32_t rs_size = has_rs ? cparams.n_seq_max : 0;
|
||||
const int64_t n_layer = hparams.n_layer;
|
||||
|
||||
cache.kv.size = kv_size;
|
||||
@ -2967,6 +2981,7 @@ static bool llama_cache_init(
|
||||
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
||||
for (auto & it : buft_layer_count) {
|
||||
int n_layers = it.second;
|
||||
// TODO: for mixed architectures, avoid allocating empty recurrent state or kv cache tensors
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ (2*has_kv + has_r+has_s)*n_layers*ggml_tensor_overhead(),
|
||||
/*.mem_buffer =*/ NULL,
|
||||
@ -2995,20 +3010,20 @@ static bool llama_cache_init(
|
||||
for (int i = 0; i < (int) n_layer; i++) {
|
||||
struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
|
||||
if (has_kv) {
|
||||
ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
|
||||
ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
|
||||
ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, hparams.n_embd_k_gqa(i)*kv_size);
|
||||
ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, hparams.n_embd_v_gqa(i)*kv_size);
|
||||
ggml_format_name(k, "cache_k_l%d", i);
|
||||
ggml_format_name(v, "cache_v_l%d", i);
|
||||
cache.kv.k_l.push_back(k);
|
||||
cache.kv.v_l.push_back(v);
|
||||
}
|
||||
if (has_r) {
|
||||
ggml_tensor * r = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd_r*rs_size);
|
||||
ggml_tensor * r = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd_r(i)*rs_size);
|
||||
ggml_format_name(r, "cache_r_l%d", i);
|
||||
cache.rs.r_l.push_back(r);
|
||||
}
|
||||
if (has_s) {
|
||||
ggml_tensor * s = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd_s*rs_size);
|
||||
ggml_tensor * s = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd_s(i)*rs_size);
|
||||
ggml_format_name(s, "cache_s_l%d", i);
|
||||
cache.rs.s_l.push_back(s);
|
||||
}
|
||||
@ -3024,7 +3039,7 @@ static bool llama_cache_init(
|
||||
return false;
|
||||
}
|
||||
ggml_backend_buffer_clear(buf, 0);
|
||||
LLAMA_LOG_INFO("%s: %10s ctx buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
|
||||
LLAMA_LOG_INFO("%s: %10s cache buf size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
|
||||
cache.bufs.push_back(buf);
|
||||
}
|
||||
|
||||
@ -3042,7 +3057,68 @@ static bool llama_cache_find_slot(
|
||||
const uint32_t rs_size = cache.rs.size;
|
||||
const uint32_t n_tokens = batch.n_tokens;
|
||||
|
||||
// FIXME: on failure, leave all caches in a consistent state.
|
||||
// only check first, to allow failing gracefully
|
||||
if (rs_size > 0) {
|
||||
// everything should fit if all seq_ids are smaller than the max
|
||||
for (uint32_t i = 0; i < n_tokens; ++i) {
|
||||
int32_t n_seq_id = batch.n_seq_id[i];
|
||||
for (int32_t j = 0; j < n_seq_id; ++j) {
|
||||
llama_seq_id seq_id = batch.seq_id[i][j];
|
||||
|
||||
if (seq_id < 0 || (uint32_t) seq_id >= rs_size) {
|
||||
// too big seq_id
|
||||
// TODO: would it be possible to resize the rs cache size instead?
|
||||
LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.rs.size);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (kv_size > 0) {
|
||||
// one KV cell per token
|
||||
if (n_tokens > kv_size) {
|
||||
LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, kv_size);
|
||||
return false;
|
||||
}
|
||||
|
||||
// if we have enough unused cells before the current head ->
|
||||
// better to start searching from the beginning of the cache, hoping to fill it
|
||||
if (cache.kv.head > cache.kv.used + 2*n_tokens) {
|
||||
cache.kv.head = 0;
|
||||
}
|
||||
|
||||
uint32_t n_tested = 0;
|
||||
|
||||
while (true) {
|
||||
if (cache.kv.head + n_tokens > kv_size) {
|
||||
n_tested += kv_size - cache.kv.head;
|
||||
cache.kv.head = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
bool found = true;
|
||||
for (uint32_t i = 0; i < n_tokens; i++) {
|
||||
if (cache.kv.cells[cache.kv.head + i].pos >= 0) {
|
||||
found = false;
|
||||
cache.kv.head += i + 1;
|
||||
n_tested += i + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (found) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (n_tested >= kv_size) {
|
||||
//LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// now modification can be done, and should NOT fail
|
||||
|
||||
if (rs_size > 0) {
|
||||
// For recurrent state architectures (like Mamba),
|
||||
@ -3061,7 +3137,7 @@ static bool llama_cache_find_slot(
|
||||
llama_seq_id seq_id = batch.seq_id[i][j];
|
||||
bool need_new_cell = false;
|
||||
// Everything should fit assuming the biggest seq_id < rs_size
|
||||
if ((uint32_t) seq_id < rs_size) {
|
||||
GGML_ASSERT((uint32_t) seq_id < rs_size);
|
||||
llama_rs_seq_meta & seq = cache.rs.seq_tails[seq_id];
|
||||
if (seq_id > max_seq) { max_seq = seq_id; }
|
||||
if (seq_id < min_seq) { min_seq = seq_id; }
|
||||
@ -3071,6 +3147,7 @@ static bool llama_cache_find_slot(
|
||||
// but there's already a cell reserved for this token, use it
|
||||
cache.rs.insert_seq_tail_to_cell_id(target_cell, seq_id);
|
||||
} else if (seq.tail < 0) {
|
||||
// this seq_id has no tail (and is empty)
|
||||
need_new_cell = true;
|
||||
} else {
|
||||
llama_rs_cell & tail = cache.rs.cells[seq.tail];
|
||||
@ -3108,6 +3185,7 @@ static bool llama_cache_find_slot(
|
||||
}
|
||||
}
|
||||
|
||||
// reserve a cell for this seq_id
|
||||
if (need_new_cell && target_cell < 0) {
|
||||
const int32_t min_cells_per_seq = cache.rs.min_cells_per_seq(seq);
|
||||
|
||||
@ -3117,12 +3195,9 @@ static bool llama_cache_find_slot(
|
||||
while (true) {
|
||||
if (cache.rs.head >= cache.rs.size) {
|
||||
cache.rs.head = 0;
|
||||
if (looped_once) {
|
||||
// avoid infinite loop
|
||||
// NOTE: this should not happen, but gracefully fail anyway
|
||||
LLAMA_LOG_ERROR("%s: recurrent state cache seems full, but should not. This is a bug.\n", __func__);
|
||||
return false;
|
||||
}
|
||||
// NOTE: this should not fail; if it does, it's a bug.
|
||||
GGML_ASSERT(!looped_once && "recurrent state cache seems full, but should not.");
|
||||
looped_once = true;
|
||||
}
|
||||
cell_id = cache.rs.head;
|
||||
@ -3191,12 +3266,6 @@ static bool llama_cache_find_slot(
|
||||
LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d\n",
|
||||
__func__, batch.pos[i], cache.rs.cells[cache.rs.head].pos - 1, seq_id);
|
||||
}
|
||||
} else {
|
||||
// too big seq_id
|
||||
// TODO: would it be possible to resize the rs cache size instead?
|
||||
LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.rs.size);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
cache.rs.head = target_cell + 1;
|
||||
}
|
||||
@ -3211,53 +3280,10 @@ static bool llama_cache_find_slot(
|
||||
cache.rs.n = max_cell - min_cell + 1;
|
||||
|
||||
// sanity check
|
||||
if (max_seq < min_seq || max_cell < min_cell) {
|
||||
return false;
|
||||
}
|
||||
GGML_ASSERT(min_seq <= max_seq && min_cell <= max_cell);
|
||||
}
|
||||
|
||||
if (kv_size > 0) {
|
||||
// one KV cell per token
|
||||
if (n_tokens > kv_size) {
|
||||
LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, kv_size);
|
||||
return false;
|
||||
}
|
||||
|
||||
// if we have enough unused cells before the current head ->
|
||||
// better to start searching from the beginning of the cache, hoping to fill it
|
||||
if (cache.kv.head > cache.kv.used + 2*n_tokens) {
|
||||
cache.kv.head = 0;
|
||||
}
|
||||
|
||||
uint32_t n_tested = 0;
|
||||
|
||||
while (true) {
|
||||
if (cache.kv.head + n_tokens > kv_size) {
|
||||
n_tested += kv_size - cache.kv.head;
|
||||
cache.kv.head = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
bool found = true;
|
||||
for (uint32_t i = 0; i < n_tokens; i++) {
|
||||
if (cache.kv.cells[cache.kv.head + i].pos >= 0) {
|
||||
found = false;
|
||||
cache.kv.head += i + 1;
|
||||
n_tested += i + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (found) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (n_tested >= kv_size) {
|
||||
//LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < n_tokens; i++) {
|
||||
cache.kv.cells[cache.kv.head + i].pos = batch.pos[i];
|
||||
|
||||
@ -4194,9 +4220,9 @@ struct llama_model_loader {
|
||||
bool get_arr(const std::string & key, std::vector<T> & result, const bool required = true) {
|
||||
const int kid = gguf_find_key(meta, key.c_str());
|
||||
|
||||
if (kid < 0) {
|
||||
if (kid < 0 || gguf_get_kv_type(meta, kid) != GGUF_TYPE_ARRAY) {
|
||||
if (required) {
|
||||
throw std::runtime_error(format("key not found in model: %s", key.c_str()));
|
||||
throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@ -4204,16 +4230,17 @@ struct llama_model_loader {
|
||||
struct GGUFMeta::ArrayInfo arr_info =
|
||||
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
|
||||
|
||||
if (arr_info.gt != GGUF_TYPE_FLOAT32 && arr_info.gt != GGUF_TYPE_INT32) {
|
||||
throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str()));
|
||||
// TODO: allow ANY lossless cast
|
||||
// GGML_ASSERT(gguf_type_size(arr_info.gt) == sizeof(T));
|
||||
switch (arr_info.gt) {
|
||||
case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
|
||||
case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value)); break;
|
||||
default:
|
||||
throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
|
||||
}
|
||||
|
||||
// GGML_ASSERT(gguf_type_size(arr_info.gt) == sizeof(T));
|
||||
GGML_ASSERT((arr_info.gt != GGUF_TYPE_FLOAT32 || std::is_same<T, float>::value));
|
||||
GGML_ASSERT((arr_info.gt != GGUF_TYPE_INT32 || std::is_same<T, int>::value));
|
||||
|
||||
result.resize(arr_info.length);
|
||||
result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
|
||||
result.reserve(arr_info.length);
|
||||
result.assign((const T *)arr_info.data, (const T *)arr_info.data + arr_info.length);
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -4750,7 +4777,12 @@ static void llm_load_hparams(
|
||||
|
||||
// n_head_kv is optional, default to n_head
|
||||
hparams.n_head_kv = hparams.n_head;
|
||||
|
||||
// per-layer n_head_kv
|
||||
if (!ml.get_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_vec, false)) {
|
||||
// global/fallback n_head_kv
|
||||
ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv, false);
|
||||
}
|
||||
|
||||
bool rope_finetuned = false;
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
|
||||
@ -6704,10 +6736,7 @@ static bool llm_load_tensors(
|
||||
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
|
||||
|
||||
const int64_t n_ff = hparams.n_ff;
|
||||
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||
ggml_context * ctx_layer = ctx_for_layer(i);
|
||||
@ -7198,8 +7227,8 @@ static void llm_build_kv_store(
|
||||
int64_t il) {
|
||||
const int64_t n_ctx = cparams.n_ctx;
|
||||
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
|
||||
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
|
||||
|
||||
GGML_ASSERT(kv.size == n_ctx);
|
||||
|
||||
@ -7465,9 +7494,9 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
int il) {
|
||||
const int64_t n_ctx = cparams.n_ctx;
|
||||
const int64_t n_head = hparams.n_head;
|
||||
const int64_t n_head_kv = hparams.n_head_kv;
|
||||
const int64_t n_head_kv = hparams.n_head_kv_l(il);
|
||||
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
|
||||
const int64_t n_embd_head_v = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
@ -7619,9 +7648,7 @@ struct llm_build_context {
|
||||
const int64_t n_head;
|
||||
const int64_t n_head_kv;
|
||||
const int64_t n_embd_head_k;
|
||||
const int64_t n_embd_k_gqa;
|
||||
const int64_t n_embd_head_v;
|
||||
const int64_t n_embd_v_gqa;
|
||||
const int64_t n_expert;
|
||||
const int64_t n_expert_used;
|
||||
|
||||
@ -7673,9 +7700,7 @@ struct llm_build_context {
|
||||
n_head (hparams.n_head),
|
||||
n_head_kv (hparams.n_head_kv),
|
||||
n_embd_head_k (hparams.n_embd_head_k),
|
||||
n_embd_k_gqa (hparams.n_embd_k_gqa()),
|
||||
n_embd_head_v (hparams.n_embd_head_v),
|
||||
n_embd_v_gqa (hparams.n_embd_v_gqa()),
|
||||
n_expert (hparams.n_expert),
|
||||
n_expert_used (hparams.n_expert_used),
|
||||
freq_base (cparams.rope_freq_base),
|
||||
@ -7746,9 +7771,9 @@ struct llm_build_context {
|
||||
// we rotate only the first n_rot dimensions
|
||||
ggml_rope_ext_inplace(ctx0,
|
||||
ggml_view_3d(ctx0, kv_self.k_l[il],
|
||||
n_embd_head_k, n_head_kv, n_ctx,
|
||||
n_embd_head_k, hparams.n_head_kv_l(il), n_ctx,
|
||||
ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
|
||||
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
||||
ggml_row_size(kv_self.k_l[il]->type, hparams.n_embd_k_gqa(il)),
|
||||
0),
|
||||
lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||
@ -7777,6 +7802,9 @@ struct llm_build_context {
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
|
||||
int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
|
||||
|
||||
ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
|
||||
n_embd_k_gqa, nm,
|
||||
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
||||
@ -11014,8 +11042,8 @@ struct llm_build_context {
|
||||
struct ggml_tensor * state_seq = build_inp_s_seq();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * conv_states = ggml_reshape_2d(ctx0, rs_self.r_l[il], hparams.n_embd_r(), rs_self.size);
|
||||
struct ggml_tensor * ssm_states = ggml_reshape_2d(ctx0, rs_self.s_l[il], hparams.n_embd_s(), rs_self.size);
|
||||
struct ggml_tensor * conv_states = ggml_reshape_2d(ctx0, rs_self.r_l[il], hparams.n_embd_r(il), rs_self.size);
|
||||
struct ggml_tensor * ssm_states = ggml_reshape_2d(ctx0, rs_self.s_l[il], hparams.n_embd_s(il), rs_self.size);
|
||||
|
||||
// copy states
|
||||
{
|
||||
@ -16452,7 +16480,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
}
|
||||
ctx->backends.push_back(ctx->backend_cpu);
|
||||
|
||||
if (!llama_cache_init(ctx->cache, ctx, type_k, type_v, cparams.n_ctx, cparams.n_seq_max, cparams.offload_kqv)) {
|
||||
if (!llama_cache_init(ctx->cache, ctx, type_k, type_v, cparams.offload_kqv)) {
|
||||
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
@ -17282,7 +17310,7 @@ static void llama_state_get_data_internal(struct llama_context * ctx, llama_data
|
||||
const auto & hparams = ctx->model.hparams;
|
||||
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); // FIXME: per layer
|
||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
// NOTE: kv_size and kv_buf_size are mostly used for sanity checks
|
||||
@ -17434,7 +17462,7 @@ size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) {
|
||||
const auto & hparams = ctx->model.hparams;
|
||||
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); // FIXME: per layer
|
||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
size_t kv_buf_size;
|
||||
@ -17627,7 +17655,7 @@ size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id)
|
||||
const auto & hparams = ctx->model.hparams;
|
||||
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); // FIXME: per layer
|
||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
for (uint32_t i = 0; i < kv_self.size; ++i) {
|
||||
@ -17713,7 +17741,7 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
|
||||
|
||||
const auto & hparams = ctx->model.hparams;
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); // FIXME: per layer
|
||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
// Write the layer count
|
||||
@ -17843,7 +17871,7 @@ size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src,
|
||||
// Sanity check model compatibility
|
||||
const auto & hparams = ctx->model.hparams;
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); // FIXME: per layer
|
||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
if (n_layer != n_layer_ref) {
|
||||
LLAMA_LOG_ERROR("%s: mismatched n_layer (%d != %d)\n", __func__, n_layer, n_layer_ref);
|
||||
|
Loading…
Reference in New Issue
Block a user