Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2025-01-05 00:04:36 +00:00)
falcon h2d + reenable vulkan

commit 8564f79036 (parent 020b1745a0)
llama.cpp: 22 changed lines
@@ -3210,6 +3210,9 @@ static struct ggml_cgraph * llm_build_falcon(
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
+#if defined(GGML_USE_KOMPUTE)
+    struct ggml_tensor * toDeviceTensor = nullptr;
+#endif
 
     if (tokens) {
         struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
@@ -3219,7 +3222,9 @@ static struct ggml_cgraph * llm_build_falcon(
             memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
         }
         ggml_set_name(inp_tokens, "inp_tokens");
+#if defined(GGML_USE_KOMPUTE)
+        toDeviceTensor = inp_tokens;
+#endif
         inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
     } else {
 #ifdef GGML_USE_MPI
@@ -3232,6 +3237,9 @@ static struct ggml_cgraph * llm_build_falcon(
         if (!ggml_allocr_is_measure(lctx.alloc)) {
             memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
         }
+#if defined(GGML_USE_KOMPUTE)
+        toDeviceTensor = inpL;
+#endif
     }
 
     const int i_gpu_start = n_layer - n_gpu_layers;
@@ -3463,6 +3471,16 @@ static struct ggml_cgraph * llm_build_falcon(
     ggml_build_forward_expand(gf, cur);
 
     ggml_free(ctx0);
 
+#if defined(GGML_USE_KOMPUTE)
+    if (lctx.ctx_kompute) {
+        if (!ggml_vk_has_h2d_all(lctx.ctx_kompute)) {
+            ggml_vk_h2d_all(lctx.ctx_kompute);
+        } else {
+            ggml_vk_h2d_tensor(lctx.ctx_kompute, toDeviceTensor);
+        }
+    }
+#endif
+
     return gf;
 }
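
The dispatch added at the end of llm_build_falcon above follows the host-to-device (h2d) pattern the Kompute backend presumably already uses for the LLaMA graph build: on the first non-measure build nothing has been uploaded yet, so ggml_vk_h2d_all pushes all tensors to the Vulkan device in one pass; on later builds only the freshly written input tensor (token ids or embeddings), recorded in toDeviceTensor, has changed on the host and is re-copied. A minimal standalone sketch of that decision logic, not part of the commit; the helper name is illustrative, and the ggml_kompute_context type and ggml_vk_* functions are assumed from the fork's Kompute header:

    // Sketch of the h2d decision shown in the hunk above.
    // Assumes a GGML_USE_KOMPUTE build of this fork.
    #if defined(GGML_USE_KOMPUTE)
    static void copy_inputs_to_device(struct ggml_kompute_context * ctx_kompute,
                                      struct ggml_tensor * toDeviceTensor) {
        if (!ctx_kompute) {
            return; // Vulkan/Kompute backend not active for this context
        }
        if (!ggml_vk_has_h2d_all(ctx_kompute)) {
            // First non-measure build: upload all tensors to the device once.
            ggml_vk_h2d_all(ctx_kompute);
        } else {
            // Subsequent builds: only the per-batch input tensor changed on
            // the host, so copy just that one.
            ggml_vk_h2d_tensor(ctx_kompute, toDeviceTensor);
        }
    }
    #endif
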
@@ -6494,7 +6512,7 @@ struct llama_context * llama_new_context_with_model(
 #elif defined(GGML_USE_KOMPUTE)
     // TODO(cebtenzzre): we need to check the type of each tensor because Q8_0 is not currently supported
     if (ggml_vk_has_device() && params.n_gpu_layers > 0
-        && model->arch == LLM_ARCH_LLAMA
+        && (model->arch == LLM_ARCH_LLAMA || model->arch == LLM_ARCH_FALCON)
         && (model->ftype == LLAMA_FTYPE_ALL_F32
             || model->ftype == LLAMA_FTYPE_MOSTLY_F16
             || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0
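
With the whitelist in llama_new_context_with_model relaxed, a Falcon-architecture GGUF in F32, F16, or Q4_0 can now take the Kompute/Vulkan path when the library is built with GGML_USE_KOMPUTE and n_gpu_layers is greater than zero. A hypothetical caller, sketched against the llama.h C API of this commit's vintage; the model path and layer count are illustrative only:

    // Hypothetical usage sketch exercising the relaxed Kompute whitelist.
    // Build llama.cpp with GGML_USE_KOMPUTE so the Vulkan path is compiled in.
    #include "llama.h"

    int main(void) {
        llama_backend_init(false);

        struct llama_context_params params = llama_context_default_params();
        params.n_gpu_layers = 32;  // must be > 0 for the Kompute/Vulkan branch

        // Illustrative path; any LLM_ARCH_FALCON model in F32/F16/Q4_0 qualifies.
        struct llama_model * model = llama_load_model_from_file("falcon-7b.Q4_0.gguf", params);
        if (model == NULL) {
            return 1;
        }
        struct llama_context * ctx = llama_new_context_with_model(model, params);

        // ... tokenize, evaluate, sample as usual ...

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }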