llama : add LLAMA_OFFLOAD_DEBUG + fix starcoder offloading

Georgi Gerganov 2023-10-30 12:14:23 +02:00
parent 792d1a1b16
commit a3f80013ad

llama.cpp

@@ -3548,11 +3548,11 @@ static struct ggml_cgraph * llm_build_llama(
model.layers[il].ffn_gate, NULL,
model.layers[il].ffn_down, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_result", il);
cb(cur, "ffn_out", il);
}
cur = ggml_add(ctx0, cur, inpFF);
cb(cur, "inpFF_+_result_w2", il);
cb(cur, "inpFF_ffn_out", il);
// input for next layer
inpL = cur;
@@ -3714,11 +3714,11 @@ static struct ggml_cgraph * llm_build_baichaun(
model.layers[il].ffn_gate, NULL,
model.layers[il].ffn_down, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_result", il);
cb(cur, "ffn_out", il);
}
cur = ggml_add(ctx0, cur, inpFF);
cb(cur, "inpFF_+_result_w2", il);
cb(cur, "inpFF_ffn_out", il);
// input for next layer
inpL = cur;
@@ -3884,14 +3884,14 @@ static struct ggml_cgraph * llm_build_falcon(
NULL, NULL,
model.layers[il].ffn_down, NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_result", il);
cb(cur, "ffn_out", il);
}
cur = ggml_add(ctx0, cur, attn_out);
cb(cur, "inpFF_+_result_w2", il);
cb(cur, "inpFF_ffn_out", il);
cur = ggml_add(ctx0, cur, inpL);
cb(cur, "inpL_+_inpFF_+_result_w2", il);
cb(cur, "inpL_inpFF_ffn_out", il);
// input for next layer
inpL = cur;
@@ -3988,6 +3988,7 @@ static struct ggml_cgraph * llm_build_starcoder(
cb(KQ_mask, "KQ_mask", -1);
pos = ggml_get_rows(ctx0, model.pos_embeddings, inp_pos);
+ cb(pos, "pos_embd", -1);
inpL = ggml_add(ctx0, embd, pos);
cb(inpL, "inpL", -1);
@@ -4027,7 +4028,7 @@ static struct ggml_cgraph * llm_build_starcoder(
// Add the input
cur = ggml_add(ctx0, cur, inpL);
cb(cur, "inpL_+_result_wo", il);
cb(cur, "inpL_kqv_out", il);
struct ggml_tensor * inpFF = cur;
@@ -4044,11 +4045,11 @@ static struct ggml_cgraph * llm_build_starcoder(
NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_result", il);
cb(cur, "ffn_out", il);
}
inpL = ggml_add(ctx0, cur, inpFF);
cb(inpL, "inpL_inpFF_ffn_out", il);
}
cur = llm_build_norm(ctx0, inpL,
@@ -4294,11 +4295,11 @@ static struct ggml_cgraph * llm_build_persimmon(
NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_result", il);
cb(cur, "ffn_out", il);
}
cur = ggml_add(ctx0, cur, inpFF);
cb(cur, "inpFF_+_result_w2", il);
cb(cur, "inpFF_ffn_out", il);
inpL = cur;
}
@@ -4432,11 +4433,11 @@ static struct ggml_cgraph * llm_build_refact(
model.layers[il].ffn_gate, NULL,
model.layers[il].ffn_down, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_result", il);
cb(cur, "ffn_out", il);
}
cur = ggml_add(ctx0, cur, inpFF);
cb(cur, "inpFF_+_result_w2", il);
cb(cur, "inpFF_ffn_out", il);
// input for next layer
inpL = cur;
@@ -4569,7 +4570,7 @@ static struct ggml_cgraph * llm_build_bloom(
// Add the input
cur = ggml_add(ctx0, cur, inpL);
cb(cur, "inpL_+_result_wo", il);
cb(cur, "inpL_kqv_out", il);
struct ggml_tensor * inpFF = cur;
@@ -4586,11 +4587,11 @@ static struct ggml_cgraph * llm_build_bloom(
NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_result", il);
cb(cur, "ffn_out", il);
}
inpL = ggml_add(ctx0, cur, inpFF);
cb(inpL, "inpFF_+_result_w2", il);
cb(inpL, "inpFF_ffn_out", il);
}
cur = llm_build_norm(ctx0, inpL,
@@ -4717,7 +4718,7 @@ static struct ggml_cgraph * llm_build_mpt(
// Add the input
cur = ggml_add(ctx0, cur, inpL);
cb(cur, "inpL_+_result_wo", il);
cb(cur, "inpL_kqv_out", il);
struct ggml_tensor * attn_out = cur;
@@ -4734,11 +4735,11 @@ static struct ggml_cgraph * llm_build_mpt(
NULL, NULL,
model.layers[il].ffn_down, NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_result", il);
cb(cur, "ffn_out", il);
}
cur = ggml_add(ctx0, cur, attn_out);
cb(cur, "inpL_+_inpFF_+_result_w2", il);
cb(cur, "inpL_inpFF_ffn_out", il);
// input for next layer
inpL = cur;
@@ -4777,6 +4778,7 @@ enum llm_offload_func_e {
OFFLOAD_FUNC_OUT,
};
+ // TODO: will be removed with backend v2
struct llm_offload_trie {
struct node {
~node() {
@@ -4850,10 +4852,12 @@ struct llm_offload_trie {
node * root = nullptr;
};
+ // TODO: will be removed with backend v2
static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map = {
//{ "inp_tokens", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel
//{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel
{ "inp_pos", OFFLOAD_FUNC_NR },
{ "pos_embd", OFFLOAD_FUNC_NR },
{ "KQ_mask", OFFLOAD_FUNC_NR },
{ "K_shift", OFFLOAD_FUNC_NR },
@@ -4902,7 +4906,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
{ "kqv_wo", OFFLOAD_FUNC_V },
{ "kqv_out", OFFLOAD_FUNC_V },
{ "inpL_+_result_wo", OFFLOAD_FUNC },
{ "inpL_kqv_out", OFFLOAD_FUNC },
{ "inpFF", OFFLOAD_FUNC },
{ "ffn_norm", OFFLOAD_FUNC },
@@ -4914,15 +4918,15 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
{ "ffn_gate_par", OFFLOAD_FUNC },
{ "ffn_down", OFFLOAD_FUNC },
{ "ffn_down_b", OFFLOAD_FUNC },
{ "ffn_result", OFFLOAD_FUNC },
{ "ffn_out", OFFLOAD_FUNC },
{ "ffn_silu", OFFLOAD_FUNC },
{ "ffn_gelu", OFFLOAD_FUNC },
{ "ffn_relu", OFFLOAD_FUNC },
{ "ffn_sqr(relu)", OFFLOAD_FUNC },
{ "inpFF_+_result_w2", OFFLOAD_FUNC },
{ "inpL_+_inpFF_+_result_w2", OFFLOAD_FUNC },
{ "inpFF_ffn_out", OFFLOAD_FUNC },
{ "inpL_inpFF_ffn_out", OFFLOAD_FUNC },
{ "result_norm", OFFLOAD_FUNC_EMB },
{ "result_output", OFFLOAD_FUNC_OUT },
@@ -4946,6 +4950,14 @@ static struct ggml_cgraph * llama_build_graph(
bool alloc_inp_KQ_mask = false;
bool alloc_inp_K_shift = false;
+ #ifdef GGML_USE_CUBLAS
+ const bool do_offload = true;
+ #else
+ const bool do_offload = true; // TODO: set to false after finishing refactoring
+ #endif
+ int n_non_view = 0; // number of non-view tensors that have been processed by the callback
// this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
if (il >= 0) {
@@ -5053,23 +5065,23 @@ static struct ggml_cgraph * llama_build_graph(
alloc_inp_K_shift = true;
}
- //
- // offload layers
- //
- // TODO: this code will be obsoleted with backend v2
- #ifdef GGML_USE_CUBLAS
- const bool do_offload = true;
- #else
- const bool do_offload = true; // TODO: set to false after finishing refactoring
- #endif
- if (!do_offload) {
+ // view tensors are not processed further
+ if (cur->view_src != nullptr) {
return;
}
- // view tensors are not offloaded
- if (cur->view_src != nullptr) {
+ if (cur->op != GGML_OP_NONE) {
+ n_non_view++;
+ }
+ //
+ // offload layers
+ //
+ // TODO: will be removed with backend v2
+ //#define LLAMA_OFFLOAD_DEBUG
+ if (!do_offload) {
+ return;
+ }
@@ -5103,11 +5115,11 @@ static struct ggml_cgraph * llama_build_graph(
llm_offload_func_e func_e = k_offload_func_trie.find(name);
if (func_e == OFFLOAD_FUNC_NOP) {
+ #ifdef LLAMA_OFFLOAD_DEBUG
// if a tensor hasn't been offloaded, we warn the user
if (worst_case) {
LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__,
cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837");
}
+ #endif
return;
}
@@ -5170,9 +5184,11 @@ static struct ggml_cgraph * llama_build_graph(
// apply offload function to the tensor
func(cur);
+ #ifdef LLAMA_OFFLOAD_DEBUG
if (worst_case) {
LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, k_offload_func_name.at(func_e).c_str());
}
+ #endif
};
struct ggml_cgraph * result = NULL;
@@ -5214,6 +5230,29 @@ static struct ggml_cgraph * llama_build_graph(
GGML_ASSERT(false);
}
+ if (worst_case) {
+ int n_non_view_total = 0;
+ for (int i = 0; i < result->n_nodes; ++i) {
+ if (result->nodes[i]->view_src == nullptr) {
+ n_non_view_total++;
+ }
+ }
+ LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total);
+ #ifdef LLAMA_OFFLOAD_DEBUG
+ if (n_non_view != n_non_view_total) {
+ LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__);
+ LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n", __func__);
+ LLAMA_LOG_WARN("%s: this can indicate an inefficiency in the graph implementation\n", __func__);
+ LLAMA_LOG_WARN("%s: build with LLAMA_OFFLOAD_DEBUG for more info\n", __func__);
+ LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n", __func__);
+ LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__);
+ }
+ #endif
+ }
return result;
}
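
The diff above renames the per-tensor labels passed to the cb(...) callback (for example "ffn_result" becomes "ffn_out") together with the matching keys in k_offload_map, which llama_build_graph consults to decide how each named tensor is offloaded. It also adds a "pos_embd" label and map entry, which appears to be the starcoder offloading fix from the commit title: before this change the position-embedding tensor was neither named through the callback nor listed in the map. Below is a minimal, hypothetical sketch of that name lookup, for illustration only; the trimmed-down enum, the std::string keys and the two-entry table are assumptions rather than the real definitions (the actual map uses const char * keys together with llm_offload_trie).

// Hypothetical sketch of the label -> offload-function lookup; simplified, not the real llama.cpp code.
#include <cstdio>
#include <string>
#include <unordered_map>

enum llm_offload_func_e {
    OFFLOAD_FUNC_NOP, // tensor stays where it is
    OFFLOAD_FUNC,     // tensor may be offloaded with the repeating layers
    OFFLOAD_FUNC_NR,  // tensor may be offloaded with the non-repeating part of the graph
};

int main() {
    // made-up subset of the real k_offload_map, keyed by the labels set via cb(...)
    const std::unordered_map<std::string, llm_offload_func_e> offload_map = {
        { "pos_embd", OFFLOAD_FUNC_NR }, // entry added by this commit (starcoder fix)
        { "ffn_out",  OFFLOAD_FUNC     }, // new name; the old "ffn_result" key is gone
    };

    for (const char * name : { "pos_embd", "ffn_out", "ffn_result" }) {
        const auto it = offload_map.find(name);
        const bool offloaded = it != offload_map.end() && it->second != OFFLOAD_FUNC_NOP;
        // "ffn_result" no longer matches anything, which is why the labels and the
        // map keys are renamed in lockstep in the commit
        std::printf("%-12s -> %s\n", name, offloaded ? "offloaded" : "not offloaded");
    }

    return 0;
}

The per-tensor log and the extra warning block guarded by #ifdef LLAMA_OFFLOAD_DEBUG are compiled out by default; they can be enabled by uncommenting the //#define LLAMA_OFFLOAD_DEBUG line the commit introduces, or by defining the macro when compiling llama.cpp.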