add patch tensor function

This commit is contained in:
    commit e9d7b6c05f
    parent 67c5e14d06

 src/llama.cpp | 205
@@ -2702,6 +2702,10 @@ struct llama_model {
     int64_t t_load_us = 0;
     int64_t t_start_us = 0;
 
+    // used by lora, to save model's original tensors
+    std::vector<struct ggml_tensor *> orig_tensors;
+    std::vector<llama_layer> orig_layers;
+
     ~llama_model() {
         for (struct ggml_context * ctx : ctxs) {
             ggml_free(ctx);
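For context on the new members: `orig_layers` stores `llama_layer` objects by value, and a `llama_layer` is essentially a struct of tensor pointers, so saving and restoring a layer copies pointers rather than tensor data. A tiny standalone sketch of that pattern, using made-up `toy_tensor`/`toy_layer` types rather than the real llama.cpp structs:

#include <vector>

// Made-up stand-ins (not the real llama.cpp structs): a layer here is just a bag
// of tensor pointers, so copying it saves/restores the pointers, not the data.
struct toy_tensor { float dummy = 0.0f; };

struct toy_layer {
    toy_tensor * wq = nullptr;
    toy_tensor * wk = nullptr;
    toy_tensor * wv = nullptr;
};

int main() {
    std::vector<toy_layer> layers(2);
    std::vector<toy_layer> orig_layers(layers); // save: copies the pointer fields

    toy_tensor merged;
    layers[0].wq = &merged;                     // patch: point at a merged tensor

    layers[0] = orig_layers[0];                 // restore: one struct assignment
    return 0;
}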
@@ -13491,6 +13495,10 @@ static struct ggml_cgraph * llama_build_graph_s_copy(llama_context & lctx) {
     return result;
 }
 
+// forward declaration
+static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml_context * ctx_build);
+static int32_t llama_lora_restore_tensors(struct llama_context & lctx);
+
 static struct ggml_cgraph * llama_build_graph(
          llama_context & lctx,
     const llama_batch & batch,
@@ -13534,6 +13542,11 @@ static struct ggml_cgraph * llama_build_graph(
 
     llm.init();
 
+    if (!lctx.lora_adapters.empty()) {
+        llama_lora_restore_tensors(lctx);
+        llama_lora_patch_tensors(lctx, llm.ctx0);
+    }
+
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
             {
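The call order in this hunk is what keeps repeated graph builds stable: originals are restored first, then the adapters are applied, so merged weights never stack from one build to the next. A minimal sketch of that loop, with hypothetical `toy_model`/`toy_adapter` types standing in for the real ones:

#include <cstdio>
#include <vector>

// Hypothetical stand-ins for the model and adapter types.
struct toy_model   { int weight = 1; int orig_weight = 1; bool saved = false; };
struct toy_adapter { int delta; };

static void restore_tensors(toy_model & m) {
    if (m.saved) {
        m.weight = m.orig_weight;   // undo any previous patch
    }
}

static void patch_tensors(toy_model & m, const std::vector<toy_adapter> & adapters) {
    if (!m.saved) {
        m.orig_weight = m.weight;   // save originals once
        m.saved = true;
    }
    for (const auto & a : adapters) {
        m.weight += a.delta;        // stand-in for adding the LoRA delta
    }
}

int main() {
    toy_model m;
    std::vector<toy_adapter> adapters = {{10}};
    for (int build = 0; build < 3; build++) {
        if (!adapters.empty()) {
            restore_tensors(m);     // same order as in llama_build_graph above
            patch_tensors(m, adapters);
        }
        printf("build %d: weight = %d\n", build, m.weight); // stays 11, never 21 or 31
    }
    return 0;
}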
@@ -18304,10 +18317,12 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co
     printf("n_tensors_per_layer %d\n", n_tensors_per_layer);
 
     // count layer buffer types
-    std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
+    std::map<ggml_backend_buffer_type_t, int> buft_tensor_count;
     for (int64_t i = 0; i < model.hparams.n_layer; i++) {
-        buft_layer_count[model.buft_layer[i].buft]++;
+        buft_tensor_count[model.buft_layer[i].buft] += n_tensors_per_layer;
     }
+    buft_tensor_count[model.buft_input.buft] += n_inp_tensors;
+    buft_tensor_count[model.buft_output.buft] += n_out_tensors;
 
     // allocate contexts
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -18320,13 +18335,11 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co
         };
         return ggml_init(params);
     };
-    for (auto & it : buft_layer_count) {
-        int n_layers = it.second;
-        printf("buf %p layers %d\n", it.first, it.second);
-        ctx_map[it.first] = new_ggml_ctx(2*n_layers*n_tensors_per_layer);
+    for (auto & it : buft_tensor_count) {
+        int n_tensors = it.second;
+        // LLAMA_LOG_INFO("buf %p layers %d\n", it.first, it.second);
+        ctx_map[it.first] = new_ggml_ctx(2*n_tensors); // for a+b tensors
     }
-    //ctx_map[model.buft_input.buft] = new_ggml_ctx(2*n_inp_tensors);
-    //ctx_map[model.buft_output.buft] = new_ggml_ctx(2*n_out_tensors);
     }
 
     // bundle lora_a and lora_b into pairs
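Together with the previous hunk, the context for each backend buffer type is now sized by the number of LoRA-carrying tensors that live on it, times two (one `lora_a` plus one `lora_b` per pair), instead of by layer count. A rough standalone sketch of that bookkeeping with a plain `std::map` and made-up buffer-type names in place of `ggml_backend_buffer_type_t`:

#include <cstdio>
#include <map>
#include <string>

int main() {
    // Hypothetical sizes; in the diff these come from the adapter's metadata.
    const int n_layer             = 4;
    const int n_tensors_per_layer = 3;   // e.g. wq/wk/wv carry LoRA in this toy
    const int n_inp_tensors       = 1;   // token embedding
    const int n_out_tensors       = 1;   // output projection

    std::map<std::string, int> buft_tensor_count;
    for (int i = 0; i < n_layer; i++) {
        // first half of the layers on CPU, rest on GPU, just for illustration
        std::string buft = (i < n_layer/2) ? "cpu" : "gpu";
        buft_tensor_count[buft] += n_tensors_per_layer;
    }
    buft_tensor_count["cpu"] += n_inp_tensors;   // input tensors follow their buffer type
    buft_tensor_count["gpu"] += n_out_tensors;

    for (const auto & it : buft_tensor_count) {
        // each LoRA pair contributes two tensors: lora_a and lora_b
        int ctx_slots = 2 * it.second;
        printf("buft %-3s -> reserve context for %d tensors\n", it.first.c_str(), ctx_slots);
    }
    return 0;
}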
@@ -18356,22 +18369,29 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co
     // add tensors
     for (auto & it : ab_map) {
         std::string name = it.first;
+        const char * cname = name.c_str();
         lora_weight & w = it.second;
         GGML_ASSERT(w.a != nullptr);
         GGML_ASSERT(w.b != nullptr);
         int il = -1;
-        sscanf(name.c_str(), "blk.%d.", &il);
+        sscanf(cname, "blk.%d.", &il);
+        struct ggml_context * dev_ctx; // device ctx
         if (il >= 0) {
-            printf("%s %p %p\n", name.c_str(), w.a, w.b);
-            struct ggml_context * dev_ctx = ctx_map.at(model.buft_layer[il].buft);
+            dev_ctx = ctx_map.at(model.buft_layer[il].buft);
+        } else if (strstr(cname, "tok") == 0) {
+            dev_ctx = ctx_map.at(model.buft_input.buft);
+        } else if (strstr(cname, "output") == 0) {
+            dev_ctx = ctx_map.at(model.buft_output.buft);
+        } else {
+            LLAMA_LOG_WARN("%s: discard tensor '%s'\n", __func__, cname);
+            continue;
+        }
+        // LLAMA_LOG_INFO("%s %p %p\n", cname, w.a, w.b);
         struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
         struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
         ggml_set_name(tensor_a, w.a->name);
         ggml_set_name(tensor_b, w.b->name);
         adapter.ab_map[name] = lora_weight(tensor_a, tensor_b);
-        } else {
-            // TODO: process output & token_embd tensors
-        }
     }
 
     // allocate tensors / buffers and zero
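Placement of each adapter tensor is now decided from its GGUF name: a `blk.%d.` prefix selects the per-layer buffer type, otherwise the name is tested for `tok` and `output` to pick the input or output buffer type, and anything else is discarded with a warning. The sketch below shows the same kind of parsing with `sscanf`/`strstr` on made-up names; note that `strstr` returns a pointer to the match (NULL when the substring is absent), and this sketch uses the non-NULL "found" form of the test:

#include <cstdio>
#include <cstring>

// Sketch of name-based routing with made-up tensor names (not the real ctx_map
// lookup). Returns which buffer-type context the pair would be created in.
static const char * route(const char * cname) {
    int il = -1;
    if (sscanf(cname, "blk.%d.", &il) == 1 && il >= 0) {
        return "per-layer buffer type";          // e.g. blk.7.attn_q.weight
    } else if (strstr(cname, "tok") != nullptr) {
        return "input buffer type";              // e.g. token_embd.weight
    } else if (strstr(cname, "output") != nullptr) {
        return "output buffer type";             // e.g. output.weight
    }
    return "discard";                            // anything else is skipped
}

int main() {
    const char * names[] = {
        "blk.7.attn_q.weight",
        "token_embd.weight",
        "output.weight",
        "rope_freqs.weight",
    };
    for (const char * n : names) {
        printf("%-22s -> %s\n", n, route(n));
    }
    return 0;
}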
@@ -18402,8 +18422,9 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co
         if (read_buf.size() < size) {
             read_buf.resize(size);
         }
+        gguf_file.seek(offs, SEEK_SET);
         gguf_file.read_raw(read_buf.data(), size);
-        printf("%s: %s size=%ld\n", __func__, orig->name, size);
+        // LLAMA_LOG_INFO("%s: %s size=%ld\n", __func__, orig->name, size);
         return ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
     };
     for (auto & it : adapter.ab_map) {
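The added `seek` makes the read explicit: jump to the tensor's byte offset in the adapter file, read `size` raw bytes into a reusable host-side staging buffer, and only then push them to the backend tensor (`ggml_backend_tensor_set` in the diff). Below is a rough stdio-only sketch of the staging step, with a hypothetical file and dummy offset/size values in place of the real GGUF metadata:

#include <cstdio>
#include <cstdint>
#include <vector>

// Hypothetical helper: read `size` raw bytes starting at `offs` into a reusable
// staging buffer. In the diff the bytes are then handed to the backend with
// ggml_backend_tensor_set(); here we only do the host-side read.
static bool read_tensor_data(FILE * f, long offs, size_t size, std::vector<uint8_t> & read_buf) {
    if (read_buf.size() < size) {
        read_buf.resize(size);
    }
    if (fseek(f, offs, SEEK_SET) != 0) {
        return false;
    }
    return fread(read_buf.data(), 1, size, f) == size;
}

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <adapter-file>\n", argv[0]);
        return 1;
    }
    FILE * f = fopen(argv[1], "rb");
    if (!f) {
        perror("fopen");
        return 1;
    }
    std::vector<uint8_t> staging;
    // offset/size would normally come from the file's metadata; dummy values here
    if (read_tensor_data(f, /*offs=*/0, /*size=*/16, staging)) {
        printf("read %zu bytes into staging buffer\n", staging.size());
    }
    fclose(f);
    return 0;
}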
@@ -18414,11 +18435,165 @@ static int llama_lora_adapter_init_internal(const struct llama_model & model, co
         }
     }
 
+    LLAMA_LOG_INFO("%s: loaded %ld tensors from lora file\n", __func__, adapter.ab_map.size()*2);
+
     // free ctx for reading gguf
     ggml_free(ctx);
     return 0;
 }
 
+static int32_t llama_lora_restore_tensors(struct llama_context & lctx) {
+    // TODO @ngxson : not ideal, but "const" is discarded to make it work
+    struct llama_model & model = const_cast<struct llama_model &>(lctx.model);
+    if (!model.orig_tensors.empty()) {
+        size_t i = 0;
+        model.tok_embd        = model.orig_tensors[i++];
+        model.type_embd       = model.orig_tensors[i++];
+        model.pos_embd        = model.orig_tensors[i++];
+        model.tok_norm        = model.orig_tensors[i++];
+        model.tok_norm_b      = model.orig_tensors[i++];
+        model.output_norm     = model.orig_tensors[i++];
+        model.output_norm_b   = model.orig_tensors[i++];
+        model.output          = model.orig_tensors[i++];
+        model.output_b        = model.orig_tensors[i++];
+        model.output_norm_enc = model.orig_tensors[i++];
+        for (size_t il = 0; il < model.orig_layers.size(); il++) {
+            model.layers[il] = model.orig_layers[il]; // copy
+        }
+    }
+}
+
+static int32_t llama_lora_patch_tensors(struct llama_context & lctx, struct ggml_context * ctx_build) {
+    GGML_ASSERT(!lctx.lora_adapters.empty());
+    // TODO @ngxson : not ideal, but "const" is discarded to make it work
+    struct llama_model & model = const_cast<struct llama_model &>(lctx.model);
+
+    // save all original tensors
+    if (model.orig_tensors.empty()) {
+        model.orig_tensors.push_back(model.tok_embd);
+        model.orig_tensors.push_back(model.type_embd);
+        model.orig_tensors.push_back(model.pos_embd);
+        model.orig_tensors.push_back(model.tok_norm);
+        model.orig_tensors.push_back(model.tok_norm_b);
+        model.orig_tensors.push_back(model.output_norm);
+        model.orig_tensors.push_back(model.output_norm_b);
+        model.orig_tensors.push_back(model.output);
+        model.orig_tensors.push_back(model.output_b);
+        model.orig_tensors.push_back(model.output_norm_enc);
+        model.orig_layers.reserve(model.layers.size());
+        for (llama_layer layer : model.layers) {
+            model.orig_layers.push_back(layer); // copy
+        }
+    }
+
+    // patch tensors
+    auto patch_tensor = [&](struct llama_lora_adapter * adapter, struct ggml_tensor ** tensor) {
+        if (*tensor == nullptr) {
+            return;
+        }
+        std::string name = ggml_get_name(*tensor);
+        if (adapter->ab_map.find(name) != adapter->ab_map.end()) {
+            auto lora_w = adapter->ab_map[name];
+            struct ggml_tensor * cur = ggml_mul_mat(ctx_build, lora_w.a, lora_w.b);
+            cur = ggml_add(ctx_build, cur, *tensor);
+            // TODO: scale
+            ggml_format_name(cur, "%s.merged", name.c_str());
+            // LLAMA_LOG_INFO("LORA %s\n", cur->name);
+            tensor = &cur;
+        }
+    };
+    for (auto adapter : lctx.lora_adapters) {
+        patch_tensor(adapter, &model.tok_embd);
+        patch_tensor(adapter, &model.type_embd);
+        patch_tensor(adapter, &model.pos_embd);
+        patch_tensor(adapter, &model.tok_norm);
+        patch_tensor(adapter, &model.tok_norm_b);
+        patch_tensor(adapter, &model.output_norm);
+        patch_tensor(adapter, &model.output_norm_b);
+        patch_tensor(adapter, &model.output);
+        patch_tensor(adapter, &model.output_b);
+        patch_tensor(adapter, &model.output_norm_enc);
+        for (llama_layer & layer : model.layers) {
+            patch_tensor(adapter, &layer.attn_norm);
+            patch_tensor(adapter, &layer.attn_norm_b);
+            patch_tensor(adapter, &layer.attn_norm_2);
+            patch_tensor(adapter, &layer.attn_norm_2_b);
+            patch_tensor(adapter, &layer.attn_q_norm);
+            patch_tensor(adapter, &layer.attn_q_norm_b);
+            patch_tensor(adapter, &layer.attn_k_norm);
+            patch_tensor(adapter, &layer.attn_k_norm_b);
+            patch_tensor(adapter, &layer.attn_out_norm);
+            patch_tensor(adapter, &layer.attn_out_norm_b);
+            patch_tensor(adapter, &layer.attn_q_a_norm);
+            patch_tensor(adapter, &layer.attn_kv_a_norm);
+            patch_tensor(adapter, &layer.attn_sub_norm);
+            patch_tensor(adapter, &layer.attn_post_norm);
+            patch_tensor(adapter, &layer.ffn_sub_norm);
+            patch_tensor(adapter, &layer.attn_norm_cross);
+            patch_tensor(adapter, &layer.attn_norm_enc);
+
+            patch_tensor(adapter, &layer.wq);
+            patch_tensor(adapter, &layer.wk);
+            patch_tensor(adapter, &layer.wv);
+            patch_tensor(adapter, &layer.wo);
+            patch_tensor(adapter, &layer.wqkv);
+            patch_tensor(adapter, &layer.wq_a);
+            patch_tensor(adapter, &layer.wq_b);
+            patch_tensor(adapter, &layer.wkv_a_mqa);
+            patch_tensor(adapter, &layer.wkv_b);
+            patch_tensor(adapter, &layer.wq_cross);
+            patch_tensor(adapter, &layer.wk_cross);
+            patch_tensor(adapter, &layer.wv_cross);
+            patch_tensor(adapter, &layer.wo_cross);
+            patch_tensor(adapter, &layer.wq_enc);
+            patch_tensor(adapter, &layer.wk_enc);
+            patch_tensor(adapter, &layer.wv_enc);
+            patch_tensor(adapter, &layer.wo_enc);
+
+            patch_tensor(adapter, &layer.bq);
+            patch_tensor(adapter, &layer.bk);
+            patch_tensor(adapter, &layer.bv);
+            patch_tensor(adapter, &layer.bo);
+            patch_tensor(adapter, &layer.bqkv);
+
+            patch_tensor(adapter, &layer.attn_rel_b);
+            patch_tensor(adapter, &layer.attn_rel_b_enc);
+            patch_tensor(adapter, &layer.attn_rel_b_cross);
+
+            patch_tensor(adapter, &layer.ffn_norm);
+            patch_tensor(adapter, &layer.ffn_norm_b);
+            patch_tensor(adapter, &layer.ffn_post_norm);
+            patch_tensor(adapter, &layer.layer_out_norm);
+            patch_tensor(adapter, &layer.layer_out_norm_b);
+            patch_tensor(adapter, &layer.ffn_norm_exps);
+            patch_tensor(adapter, &layer.ffn_norm_enc);
+
+            patch_tensor(adapter, &layer.ffn_gate);
+            patch_tensor(adapter, &layer.ffn_down);
+            patch_tensor(adapter, &layer.ffn_up);
+            patch_tensor(adapter, &layer.ffn_gate_enc);
+            patch_tensor(adapter, &layer.ffn_down_enc);
+            patch_tensor(adapter, &layer.ffn_up_enc);
+
+            patch_tensor(adapter, &layer.ffn_gate_inp);
+            patch_tensor(adapter, &layer.ffn_gate_exps);
+            patch_tensor(adapter, &layer.ffn_down_exps);
+            patch_tensor(adapter, &layer.ffn_up_exps);
+
+            patch_tensor(adapter, &layer.ffn_gate_inp_shexp);
+            patch_tensor(adapter, &layer.ffn_gate_shexp);
+            patch_tensor(adapter, &layer.ffn_down_shexp);
+            patch_tensor(adapter, &layer.ffn_up_shexp);
+
+            patch_tensor(adapter, &layer.ffn_gate_b);
+            patch_tensor(adapter, &layer.ffn_down_b);
+            patch_tensor(adapter, &layer.ffn_up_b);
+            patch_tensor(adapter, &layer.ffn_act);
+        }
+    }
+    return 0;
+}
+
 //
 // interface implementation
 //
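The heart of `llama_lora_patch_tensors` is the per-tensor merge: multiply the low-rank pair and add the result onto the base weight, conceptually W' = W + scale * (B x A), with the scale factor (commonly alpha / r) still marked as a TODO in the diff. The toy sketch below does the same arithmetic on plain nested vectors; it does not reproduce ggml's tensor layout or the `ggml_mul_mat` argument convention:

#include <cstdio>
#include <vector>

// Toy low-rank merge: W' = W + scale * (B x A), where A is r x n and B is m x r.
using Mat = std::vector<std::vector<float>>;

static Mat matmul(const Mat & B, const Mat & A) {          // (m x r) * (r x n)
    size_t m = B.size(), r = A.size(), n = A[0].size();
    Mat C(m, std::vector<float>(n, 0.0f));
    for (size_t i = 0; i < m; i++)
        for (size_t k = 0; k < r; k++)
            for (size_t j = 0; j < n; j++)
                C[i][j] += B[i][k] * A[k][j];
    return C;
}

int main() {
    Mat W = {{1, 0}, {0, 1}};                               // 2 x 2 base weight
    Mat A = {{0.5f, -0.5f}};                                // 1 x 2 (rank r = 1)
    Mat B = {{1.0f}, {2.0f}};                               // 2 x 1
    float scale = 1.0f;                                     // e.g. alpha / r

    Mat delta = matmul(B, A);                               // low-rank update
    for (size_t i = 0; i < W.size(); i++)
        for (size_t j = 0; j < W[0].size(); j++)
            W[i][j] += scale * delta[i][j];                 // merged weight

    for (const auto & row : W) {
        for (float v : row) printf("%6.2f ", v);
        printf("\n");
    }
    return 0;
}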