falcon: add h2d (host-to-device) tensor upload + re-enable Vulkan

This commit is contained in:
Aaron Miller 2023-10-04 21:03:27 -07:00 committed by cebtenzzre
parent 020b1745a0
commit 8564f79036

View File

@ -3210,6 +3210,9 @@ static struct ggml_cgraph * llm_build_falcon(
struct ggml_tensor * cur; struct ggml_tensor * cur;
struct ggml_tensor * inpL; struct ggml_tensor * inpL;
#if defined(GGML_USE_KOMPUTE)
struct ggml_tensor * toDeviceTensor = nullptr;
#endif
if (tokens) { if (tokens) {
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
@ -3219,7 +3222,9 @@ static struct ggml_cgraph * llm_build_falcon(
memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
} }
ggml_set_name(inp_tokens, "inp_tokens"); ggml_set_name(inp_tokens, "inp_tokens");
#if defined(GGML_USE_KOMPUTE)
toDeviceTensor = inp_tokens;
#endif
inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
} else { } else {
#ifdef GGML_USE_MPI #ifdef GGML_USE_MPI
@ -3232,6 +3237,9 @@ static struct ggml_cgraph * llm_build_falcon(
if (!ggml_allocr_is_measure(lctx.alloc)) { if (!ggml_allocr_is_measure(lctx.alloc)) {
memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
} }
#if defined(GGML_USE_KOMPUTE)
toDeviceTensor = inpL;
#endif
} }
const int i_gpu_start = n_layer - n_gpu_layers; const int i_gpu_start = n_layer - n_gpu_layers;
@ -3464,6 +3472,16 @@ static struct ggml_cgraph * llm_build_falcon(
ggml_free(ctx0); ggml_free(ctx0);
#if defined(GGML_USE_KOMPUTE)
if (lctx.ctx_kompute) {
if (!ggml_vk_has_h2d_all(lctx.ctx_kompute)) {
ggml_vk_h2d_all(lctx.ctx_kompute);
} else {
ggml_vk_h2d_tensor(lctx.ctx_kompute, toDeviceTensor);
}
}
#endif
return gf; return gf;
} }
@ -6494,7 +6512,7 @@ struct llama_context * llama_new_context_with_model(
#elif defined(GGML_USE_KOMPUTE) #elif defined(GGML_USE_KOMPUTE)
// TODO(cebtenzzre): we need to check the type of each tensor because Q8_0 is not currently supported // TODO(cebtenzzre): we need to check the type of each tensor because Q8_0 is not currently supported
if (ggml_vk_has_device() && params.n_gpu_layers > 0 if (ggml_vk_has_device() && params.n_gpu_layers > 0
&& model->arch == LLM_ARCH_LLAMA && (model->arch == LLM_ARCH_LLAMA || model->arch == LLM_ARCH_FALCON)
&& (model->ftype == LLAMA_FTYPE_ALL_F32 && (model->ftype == LLAMA_FTYPE_ALL_F32
|| model->ftype == LLAMA_FTYPE_MOSTLY_F16 || model->ftype == LLAMA_FTYPE_MOSTLY_F16
|| model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0