mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-27 20:04:35 +00:00
llama : support offloading result_norm + comments
This commit is contained in:
parent
51c4f9ee9f
commit
4e98897ede
17
llama.cpp
17
llama.cpp
@ -5452,12 +5452,16 @@ static struct ggml_cgraph * llama_build_graph(
|
|||||||
} while (0);
|
} while (0);
|
||||||
|
|
||||||
// offload layers
|
// offload layers
|
||||||
|
// TODO: this code will be obsoleted with backend v2
|
||||||
{
|
{
|
||||||
const int n_layer = model.hparams.n_layer;
|
const int n_layer = model.hparams.n_layer;
|
||||||
|
|
||||||
const int n_gpu_layers = model.n_gpu_layers;
|
const int n_gpu_layers = model.n_gpu_layers;
|
||||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||||
|
|
||||||
|
// should we offload the final norm? yes if we are not computing embeddings
|
||||||
|
const bool off_res_norm = !lctx.embedding.empty();
|
||||||
|
|
||||||
// offload functions set the tensor output backend to GPU
|
// offload functions set the tensor output backend to GPU
|
||||||
// tensors are GPU-accelerated if any input or the output has been offloaded
|
// tensors are GPU-accelerated if any input or the output has been offloaded
|
||||||
offload_func_t offload_func_nr = ggml_offload_nop; // nr = non-repeating
|
offload_func_t offload_func_nr = ggml_offload_nop; // nr = non-repeating
|
||||||
@ -5566,7 +5570,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|||||||
{ "out_norm_0", offload_func_nr },
|
{ "out_norm_0", offload_func_nr },
|
||||||
{ "out_norm_0_w", offload_func_nr },
|
{ "out_norm_0_w", offload_func_nr },
|
||||||
|
|
||||||
//{ "result_norm", offload_func_nr }, // TODO CPU + GPU mirrored backend
|
{ "result_norm", off_res_norm ? offload_func_nr : ggml_offload_nop },
|
||||||
//{ "result_output", offload_func },
|
//{ "result_output", offload_func },
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -5584,7 +5588,8 @@ static struct ggml_cgraph * llama_build_graph(
|
|||||||
|
|
||||||
const std::string name = cur->name;
|
const std::string name = cur->name;
|
||||||
|
|
||||||
if (k_offload_func.find(name) == k_offload_func.end()) {
|
const auto it = k_offload_func.find(name);
|
||||||
|
if (it == k_offload_func.end()) {
|
||||||
// if a tensor that is not view hasn't been offloaded, we warn the user
|
// if a tensor that is not view hasn't been offloaded, we warn the user
|
||||||
if (worst_case && cur->view_src == nullptr) {
|
if (worst_case && cur->view_src == nullptr) {
|
||||||
LLAMA_LOG_WARN("%s: node %4d %32s: not offloaded (ref: %s)\n", __func__,
|
LLAMA_LOG_WARN("%s: node %4d %32s: not offloaded (ref: %s)\n", __func__,
|
||||||
@ -5595,7 +5600,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// count the number of layers and respect the provided n_gpu_layers
|
// count the number of layers and respect the provided n_gpu_layers
|
||||||
offload_func_t f = k_offload_func.at(name);
|
offload_func_t f = it->second;
|
||||||
if (f == offload_func) {
|
if (f == offload_func) {
|
||||||
if (ofn[name]++ < i_gpu_start) {
|
if (ofn[name]++ < i_gpu_start) {
|
||||||
f = ggml_offload_nop;
|
f = ggml_offload_nop;
|
||||||
@ -5753,11 +5758,13 @@ static int llama_decode_internal(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// If all tensors can be run on the GPU then using more than 1 thread is detrimental.
|
// If all tensors can be run on the GPU then using more than 1 thread is detrimental.
|
||||||
const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
|
const bool full_offload_supported =
|
||||||
|
model.arch == LLM_ARCH_LLAMA ||
|
||||||
model.arch == LLM_ARCH_BAICHUAN ||
|
model.arch == LLM_ARCH_BAICHUAN ||
|
||||||
model.arch == LLM_ARCH_FALCON ||
|
model.arch == LLM_ARCH_FALCON ||
|
||||||
model.arch == LLM_ARCH_REFACT ||
|
model.arch == LLM_ARCH_REFACT ||
|
||||||
model.arch == LLM_ARCH_MPT;
|
model.arch == LLM_ARCH_MPT;
|
||||||
|
|
||||||
const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
|
const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
|
||||||
if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
|
if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
|
||||||
n_threads = 1;
|
n_threads = 1;
|
||||||
@ -5803,6 +5810,8 @@ static int llama_decode_internal(
|
|||||||
//}
|
//}
|
||||||
|
|
||||||
// extract logits
|
// extract logits
|
||||||
|
// TODO: do not compute and extract logits if only embeddings are needed
|
||||||
|
// need to update the graphs to skip "result_output"
|
||||||
{
|
{
|
||||||
auto & logits_out = lctx.logits;
|
auto & logits_out = lctx.logits;
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user