mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-27 20:04:35 +00:00
llama : fix res_norm offloading
This commit is contained in:
parent
e14aa46151
commit
79617902ea
31
llama.cpp
31
llama.cpp
@ -5456,13 +5456,15 @@ static struct ggml_cgraph * llama_build_graph(
|
|||||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||||
|
|
||||||
// should we offload the final norm? yes if we are not computing embeddings
|
// should we offload the final norm? yes if we are not computing embeddings
|
||||||
const bool off_res_norm = !lctx.embedding.empty();
|
const bool off_res_norm = lctx.embedding.empty();
|
||||||
|
|
||||||
// offload functions set the tensor output backend to GPU
|
// offload functions set the tensor output backend to GPU
|
||||||
// tensors are GPU-accelerated if any input or the output has been offloaded
|
// tensors are GPU-accelerated if any input or the output has been offloaded
|
||||||
offload_func_t offload_func_nr = ggml_offload_nop; // nr = non-repeating
|
offload_func_t offload_func_nr = ggml_offload_nop; // nr = non-repeating
|
||||||
offload_func_t offload_func_kq = ggml_offload_nop;
|
offload_func_t offload_func_kq = ggml_offload_nop;
|
||||||
offload_func_t offload_func_v = ggml_offload_nop;
|
offload_func_t offload_func_v = ggml_offload_nop;
|
||||||
|
offload_func_t offload_func_emb = ggml_offload_nop;
|
||||||
|
offload_func_t offload_func_out = ggml_offload_nop;
|
||||||
offload_func_t offload_func = ggml_offload_nop;
|
offload_func_t offload_func = ggml_offload_nop;
|
||||||
|
|
||||||
#ifdef GGML_USE_CUBLAS
|
#ifdef GGML_USE_CUBLAS
|
||||||
@ -5476,10 +5478,20 @@ static struct ggml_cgraph * llama_build_graph(
|
|||||||
offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
|
offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
offload_func_emb = off_res_norm ? ggml_cuda_assign_buffers_no_alloc : ggml_offload_nop;
|
||||||
|
offload_func_out = ggml_offload_nop;
|
||||||
|
|
||||||
offload_func = ggml_cuda_assign_buffers_no_alloc;
|
offload_func = ggml_cuda_assign_buffers_no_alloc;
|
||||||
#endif // GGML_USE_CUBLAS
|
#endif // GGML_USE_CUBLAS
|
||||||
|
|
||||||
static const std::unordered_map<std::string, offload_func_t> k_offload_func = {
|
static const std::unordered_map<offload_func_t, std::string> k_offload_func_name = {
|
||||||
|
{ ggml_offload_nop, "CPU" },
|
||||||
|
#ifdef GGML_USE_CUBLAS
|
||||||
|
{ ggml_cuda_assign_buffers_no_alloc, "GPU (CUDA)" },
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
const std::unordered_map<std::string, offload_func_t> k_offload_func = {
|
||||||
{ "KQ_mask", offload_func_kq },
|
{ "KQ_mask", offload_func_kq },
|
||||||
{ "KQ_pos", offload_func_kq },
|
{ "KQ_pos", offload_func_kq },
|
||||||
{ "K_shift", offload_func_kq },
|
{ "K_shift", offload_func_kq },
|
||||||
@ -5566,15 +5578,8 @@ static struct ggml_cgraph * llama_build_graph(
|
|||||||
{ "out_norm_0", offload_func_nr },
|
{ "out_norm_0", offload_func_nr },
|
||||||
{ "out_norm_0_w", offload_func_nr },
|
{ "out_norm_0_w", offload_func_nr },
|
||||||
|
|
||||||
{ "result_norm", off_res_norm ? offload_func_nr : ggml_offload_nop },
|
{ "result_norm", offload_func_emb },
|
||||||
//{ "result_output", offload_func },
|
{ "result_output", offload_func_out },
|
||||||
};
|
|
||||||
|
|
||||||
static const std::unordered_map<offload_func_t, std::string> k_offload_func_name = {
|
|
||||||
{ ggml_offload_nop, "CPU" },
|
|
||||||
#ifdef GGML_USE_CUBLAS
|
|
||||||
{ ggml_cuda_assign_buffers_no_alloc, "GPU (CUDA)" },
|
|
||||||
#endif
|
|
||||||
};
|
};
|
||||||
|
|
||||||
std::unordered_map<std::string, int> ofn;
|
std::unordered_map<std::string, int> ofn;
|
||||||
@ -5591,7 +5596,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|||||||
|
|
||||||
const auto it = k_offload_func.find(name);
|
const auto it = k_offload_func.find(name);
|
||||||
if (it == k_offload_func.end()) {
|
if (it == k_offload_func.end()) {
|
||||||
// if a tensor that is not view hasn't been offloaded, we warn the user
|
// if a tensor hasn't been offloaded, we warn the user
|
||||||
if (worst_case) {
|
if (worst_case) {
|
||||||
LLAMA_LOG_WARN("%s: node %4d %32s: not offloaded (ref: %s)\n", __func__,
|
LLAMA_LOG_WARN("%s: node %4d %32s: not offloaded (ref: %s)\n", __func__,
|
||||||
i, name.c_str(), "https://github.com/ggerganov/llama.cpp/pull/3837");
|
i, name.c_str(), "https://github.com/ggerganov/llama.cpp/pull/3837");
|
||||||
@ -5602,7 +5607,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|||||||
|
|
||||||
// count the number of layers and respect the provided n_gpu_layers
|
// count the number of layers and respect the provided n_gpu_layers
|
||||||
offload_func_t f = it->second;
|
offload_func_t f = it->second;
|
||||||
if (f == offload_func) {
|
if (n_gpu_layers < n_layer && f == offload_func) {
|
||||||
if (ofn[name]++ < i_gpu_start) {
|
if (ofn[name]++ < i_gpu_start) {
|
||||||
f = ggml_offload_nop;
|
f = ggml_offload_nop;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user