llama : de-shadow (wip) [no ci]

Georgi Gerganov 2025-01-12 12:15:19 +02:00
parent 168324a388
commit 0bebe45a25
GPG Key ID: 449E073F9DC10735
3 changed files with 31 additions and 29 deletions

View File

@@ -204,6 +204,7 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
             __func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data);
         // print first 10 elements
+        {
         const float * data = (const float *) cur->data;
         printf("%s data[:10] : ", name);
@@ -211,6 +212,7 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
             printf("%f ", data[j]);
         }
         printf("\n\n");
+        }
         // check data
         if (check_data) {
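
The braces added above give the temporary `data` pointer its own block scope, so it no longer shadows another `data` local used later in the same function when the code is built with -Wshadow. Below is a minimal standalone sketch of that pattern; the variable and vector names are illustrative, not taken from the example source.

#include <cstdio>
#include <vector>

int main() {
    std::vector<float> values = {1.0f, 2.0f, 3.0f};

    // print: this pointer lives only inside its own block
    {
        const float * data = values.data();
        printf("first element: %f\n", data[0]);
    }

    // check: a second local named `data` is fine here, because the two
    // blocks are siblings rather than nested, so nothing is shadowed
    {
        const float * data = values.data();
        if (data[0] != 1.0f) {
            return 1;
        }
    }

    return 0;
}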

View File

@@ -58,12 +58,12 @@ struct llama_kv_cache {
     std::vector<ggml_backend_buffer_ptr> bufs;
     size_t total_size() const {
-        size_t size = 0;
+        size_t size_all = 0;
         for (const auto & buf : bufs) {
-            size += ggml_backend_buffer_get_size(buf.get());
+            size_all += ggml_backend_buffer_get_size(buf.get());
         }
-        return size;
+        return size_all;
     }
     // TODO: better data structures to reduce the cost of this operation
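
The accumulator in total_size() is renamed because the surrounding KV-cache struct also carries a member named `size`, which the old local apparently shadowed. A reduced sketch of the same idea, with made-up member names (only the size_all rename mirrors the diff):

#include <cstddef>
#include <cstdio>
#include <vector>

// Illustrative stand-in for a cache object: the point is only that the
// struct has a member named `size` that a local could shadow.
struct fake_cache {
    size_t size = 0;                // number of cells (made up)
    std::vector<size_t> buf_sizes;  // per-buffer sizes in bytes (made up)

    size_t total_size() const {
        size_t size_all = 0;        // renamed: does not shadow the member
        for (const auto s : buf_sizes) {
            size_all += s;
        }
        return size_all;
    }
};

int main() {
    fake_cache cache;
    cache.buf_sizes = {1024, 2048};
    printf("total: %zu bytes\n", cache.total_size());
    return 0;
}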

View File

@@ -1174,14 +1174,15 @@ struct llm_build_context {
         ggml_set_input(lctx.inp_K_shift);
         for (int il = 0; il < n_layer; ++il) {
-            const int64_t n_head_kv = hparams.n_head_kv(il);
-            const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+            const int64_t n_head_kv_i = hparams.n_head_kv(il);
+            const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(il);
             struct ggml_tensor * rope_factors = build_rope_factors(il);
             struct ggml_tensor * k =
                 ggml_view_3d(ctx0, kv_self.k_l[il],
-                    n_embd_head_k, n_head_kv, n_ctx,
+                    n_embd_head_k, n_head_kv_i, n_ctx,
                     ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
-                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i),
                     0);
             struct ggml_tensor * tmp;
@@ -1231,18 +1232,18 @@
         }
         for (int il = 0; il < n_layer; ++il) {
-            const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
-            const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+            const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(il);
+            const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(il);
             ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
-                    n_embd_k_gqa, nm,
-                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
+                    n_embd_k_gqa_i, nm,
+                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i),
+                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i*i));
             ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
-                    n_embd_k_gqa, nm,
-                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
+                    n_embd_k_gqa_i, nm,
+                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i),
+                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i*id));
             ggml_tensor * view_v_src;
             ggml_tensor * view_v_dst;
@@ -1250,22 +1251,22 @@
             if (flash_attn) {
                 // NOTE: the V cache is not transposed when using flash attention
                 view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                        n_embd_v_gqa, nm,
-                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
+                        n_embd_v_gqa_i, nm,
+                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i),
+                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i*i));
                 view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                        n_embd_v_gqa, nm,
-                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
+                        n_embd_v_gqa_i, nm,
+                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i),
+                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i*id));
             } else {
                 view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                        nm, n_embd_v_gqa,
+                        nm, n_embd_v_gqa_i,
                         ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
                         ggml_row_size(kv_self.v_l[il]->type, i));
                 view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                        nm, n_embd_v_gqa,
+                        nm, n_embd_v_gqa_i,
                         ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
                         ggml_row_size(kv_self.v_l[il]->type, id));
             }
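
In both loops above, the per-layer values gain an `_i` suffix so they stop shadowing the model-wide `n_head_kv` / `n_embd_k_gqa` / `n_embd_v_gqa` values that the enclosing build context already exposes. A self-contained sketch of that convention; the struct, numbers, and helper below are invented for illustration:

#include <cstdint>
#include <cstdio>

struct toy_build_ctx {
    int64_t n_embd_k_gqa = 4096;  // model-wide value (made up)

    // made-up per-layer override
    int64_t n_embd_k_gqa_for(int il) const {
        return il % 2 == 0 ? n_embd_k_gqa : n_embd_k_gqa / 4;
    }

    void build(int n_layer) const {
        for (int il = 0; il < n_layer; ++il) {
            // `_i` suffix: the per-layer value no longer shadows the member
            const int64_t n_embd_k_gqa_i = n_embd_k_gqa_for(il);
            printf("layer %d: %lld (model-wide %lld)\n",
                   il, (long long) n_embd_k_gqa_i, (long long) n_embd_k_gqa);
        }
    }
};

int main() {
    toy_build_ctx ctx;
    ctx.build(4);
    return 0;
}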
@ -1459,7 +1460,6 @@ struct llm_build_context {
}
struct ggml_tensor * llm_build_inp_embd_enc() {
const int64_t n_embd = hparams.n_embd;
lctx.inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc);
ggml_set_input(lctx.inp_embd_enc);
cb(lctx.inp_embd_enc, "embd_enc", -1);
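
The last hunk deletes the shadowing local outright: an `n_embd` value is already visible to llm_build_inp_embd_enc() from the enclosing context, so the tensor can be sized from it directly. A tiny illustrative reproduction of what -Wshadow catches here (names and values are made up):

#include <cstdint>

struct toy_ctx {
    int64_t n_embd = 768;  // made-up member providing the value

    int64_t build_input() const {
        // const int64_t n_embd = 768;  // with -Wshadow, GCC warns that this
        //                              // declaration shadows the member above
        return n_embd;                  // use the member directly instead
    }
};

int main() {
    toy_ctx ctx;
    return ctx.build_input() == 768 ? 0 : 1;
}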