mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-14 04:24:30 +00:00
correct tensor patch
This commit is contained in:
parent
e9d7b6c05f
commit
4e28ad40a0
@ -19339,7 +19339,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
|||||||
|
|
||||||
fprintf(fp, "digraph G {\n");
|
fprintf(fp, "digraph G {\n");
|
||||||
fprintf(fp, " newrank = true;\n");
|
fprintf(fp, " newrank = true;\n");
|
||||||
fprintf(fp, " rankdir = LR;\n");
|
fprintf(fp, " rankdir = TB;\n");
|
||||||
|
|
||||||
for (int i = 0; i < gb->n_nodes; i++) {
|
for (int i = 0; i < gb->n_nodes; i++) {
|
||||||
struct ggml_tensor * node = gb->nodes[i];
|
struct ggml_tensor * node = gb->nodes[i];
|
||||||
@ -19401,7 +19401,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
|||||||
}
|
}
|
||||||
|
|
||||||
fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
|
fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
|
||||||
if (ggml_nelements(node) < 5) {
|
if (ggml_nelements(node) < 5 && node->data != NULL) {
|
||||||
fprintf(fp, " | (");
|
fprintf(fp, " | (");
|
||||||
for (int j = 0; j < ggml_nelements(node); j++) {
|
for (int j = 0; j < ggml_nelements(node); j++) {
|
||||||
if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
|
if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
|
||||||
|
@ -18314,7 +18314,7 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co
|
|||||||
if (il == 0) n_tensors_per_layer++;
|
if (il == 0) n_tensors_per_layer++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
printf("n_tensors_per_layer %d\n", n_tensors_per_layer);
|
// printf("n_tensors_per_layer %d\n", n_tensors_per_layer);
|
||||||
|
|
||||||
// count layer buffer types
|
// count layer buffer types
|
||||||
std::map<ggml_backend_buffer_type_t, int> buft_tensor_count;
|
std::map<ggml_backend_buffer_type_t, int> buft_tensor_count;
|
||||||
@ -18363,6 +18363,8 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co
|
|||||||
} else {
|
} else {
|
||||||
ab_map[name].b = cur;
|
ab_map[name].b = cur;
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cur->name);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -18400,14 +18402,14 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co
|
|||||||
adapter.bufs.reserve(ctx_map.size());
|
adapter.bufs.reserve(ctx_map.size());
|
||||||
for (auto it : ctx_map) {
|
for (auto it : ctx_map) {
|
||||||
ggml_backend_buffer_type_t buft = it.first;
|
ggml_backend_buffer_type_t buft = it.first;
|
||||||
ggml_context * ctx = it.second;
|
ggml_context * ctx_dev = it.second;
|
||||||
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft);
|
||||||
if (!buf) {
|
if (!buf) {
|
||||||
LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora adapter\n", __func__);
|
LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora adapter\n", __func__);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
ggml_backend_buffer_clear(buf, 0);
|
ggml_backend_buffer_clear(buf, 0);
|
||||||
adapter.ctxs.push_back(ctx);
|
adapter.ctxs.push_back(ctx_dev);
|
||||||
adapter.bufs.push_back(buf);
|
adapter.bufs.push_back(buf);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -18424,8 +18426,8 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co
|
|||||||
}
|
}
|
||||||
gguf_file.seek(offs, SEEK_SET);
|
gguf_file.seek(offs, SEEK_SET);
|
||||||
gguf_file.read_raw(read_buf.data(), size);
|
gguf_file.read_raw(read_buf.data(), size);
|
||||||
// LLAMA_LOG_INFO("%s: %s size=%ld\n", __func__, orig->name, size);
|
// LLAMA_LOG_INFO("%s: %s size=%ld\n", __func__, dev->name, size);
|
||||||
return ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
|
ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
|
||||||
};
|
};
|
||||||
for (auto & it : adapter.ab_map) {
|
for (auto & it : adapter.ab_map) {
|
||||||
auto orig = ab_map[it.first];
|
auto orig = ab_map[it.first];
|
||||||
@ -18461,6 +18463,7 @@ static int32_t llama_lora_restore_tensors(struct llama_context & lctx) {
|
|||||||
model.layers[il] = model.orig_layers[il]; // copy
|
model.layers[il] = model.orig_layers[il]; // copy
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml_context * ctx_build) {
|
static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml_context * ctx_build) {
|
||||||
@ -18498,8 +18501,8 @@ static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml
|
|||||||
cur = ggml_add(ctx_build, cur, *tensor);
|
cur = ggml_add(ctx_build, cur, *tensor);
|
||||||
// TODO: scale
|
// TODO: scale
|
||||||
ggml_format_name(cur, "%s.merged", name.c_str());
|
ggml_format_name(cur, "%s.merged", name.c_str());
|
||||||
// LLAMA_LOG_INFO("LORA %s\n", cur->name);
|
// LLAMA_LOG_INFO("LORA %p %s\n", cur, cur->name);
|
||||||
tensor = &cur;
|
*tensor = cur;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
for (auto adapter : lctx.lora_adapters) {
|
for (auto adapter : lctx.lora_adapters) {
|
||||||
@ -18541,14 +18544,6 @@ static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml
|
|||||||
patch_tensor(adapter, &layer.wq_b);
|
patch_tensor(adapter, &layer.wq_b);
|
||||||
patch_tensor(adapter, &layer.wkv_a_mqa);
|
patch_tensor(adapter, &layer.wkv_a_mqa);
|
||||||
patch_tensor(adapter, &layer.wkv_b);
|
patch_tensor(adapter, &layer.wkv_b);
|
||||||
patch_tensor(adapter, &layer.wq_cross);
|
|
||||||
patch_tensor(adapter, &layer.wk_cross);
|
|
||||||
patch_tensor(adapter, &layer.wv_cross);
|
|
||||||
patch_tensor(adapter, &layer.wo_cross);
|
|
||||||
patch_tensor(adapter, &layer.wq_enc);
|
|
||||||
patch_tensor(adapter, &layer.wk_enc);
|
|
||||||
patch_tensor(adapter, &layer.wv_enc);
|
|
||||||
patch_tensor(adapter, &layer.wo_enc);
|
|
||||||
|
|
||||||
patch_tensor(adapter, &layer.bq);
|
patch_tensor(adapter, &layer.bq);
|
||||||
patch_tensor(adapter, &layer.bk);
|
patch_tensor(adapter, &layer.bk);
|
||||||
@ -18556,10 +18551,6 @@ static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml
|
|||||||
patch_tensor(adapter, &layer.bo);
|
patch_tensor(adapter, &layer.bo);
|
||||||
patch_tensor(adapter, &layer.bqkv);
|
patch_tensor(adapter, &layer.bqkv);
|
||||||
|
|
||||||
patch_tensor(adapter, &layer.attn_rel_b);
|
|
||||||
patch_tensor(adapter, &layer.attn_rel_b_enc);
|
|
||||||
patch_tensor(adapter, &layer.attn_rel_b_cross);
|
|
||||||
|
|
||||||
patch_tensor(adapter, &layer.ffn_norm);
|
patch_tensor(adapter, &layer.ffn_norm);
|
||||||
patch_tensor(adapter, &layer.ffn_norm_b);
|
patch_tensor(adapter, &layer.ffn_norm_b);
|
||||||
patch_tensor(adapter, &layer.ffn_post_norm);
|
patch_tensor(adapter, &layer.ffn_post_norm);
|
||||||
@ -18578,7 +18569,7 @@ static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml
|
|||||||
patch_tensor(adapter, &layer.ffn_gate_inp);
|
patch_tensor(adapter, &layer.ffn_gate_inp);
|
||||||
patch_tensor(adapter, &layer.ffn_gate_exps);
|
patch_tensor(adapter, &layer.ffn_gate_exps);
|
||||||
patch_tensor(adapter, &layer.ffn_down_exps);
|
patch_tensor(adapter, &layer.ffn_down_exps);
|
||||||
patch_tensor(adapter, &layer.ffn_up_exps );
|
patch_tensor(adapter, &layer.ffn_up_exps);
|
||||||
|
|
||||||
patch_tensor(adapter, &layer.ffn_gate_inp_shexp);
|
patch_tensor(adapter, &layer.ffn_gate_inp_shexp);
|
||||||
patch_tensor(adapter, &layer.ffn_gate_shexp);
|
patch_tensor(adapter, &layer.ffn_gate_shexp);
|
||||||
|
Loading…
Reference in New Issue
Block a user