mirror of https://github.com/ggerganov/llama.cpp.git (synced 2025-01-02 14:54:35 +00:00)

commit 8564f79036 (parent 020b1745a0)

    falcon h2d + reenable vulkan

llama.cpp | 22 changed lines
@@ -3210,6 +3210,9 @@ static struct ggml_cgraph * llm_build_falcon(
     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
+#if defined(GGML_USE_KOMPUTE)
+    struct ggml_tensor * toDeviceTensor = nullptr;
+#endif

     if (tokens) {
         struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
@@ -3219,7 +3222,9 @@ static struct ggml_cgraph * llm_build_falcon(
             memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
         }
         ggml_set_name(inp_tokens, "inp_tokens");
-
+#if defined(GGML_USE_KOMPUTE)
+        toDeviceTensor = inp_tokens;
+#endif
         inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
     } else {
 #ifdef GGML_USE_MPI
@@ -3232,6 +3237,9 @@ static struct ggml_cgraph * llm_build_falcon(
         if (!ggml_allocr_is_measure(lctx.alloc)) {
             memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
         }
+#if defined(GGML_USE_KOMPUTE)
+        toDeviceTensor = inpL;
+#endif
     }

     const int i_gpu_start = n_layer - n_gpu_layers;
@@ -3463,6 +3471,16 @@ static struct ggml_cgraph * llm_build_falcon(
     ggml_build_forward_expand(gf, cur);

     ggml_free(ctx0);

+#if defined(GGML_USE_KOMPUTE)
+    if (lctx.ctx_kompute) {
+        if (!ggml_vk_has_h2d_all(lctx.ctx_kompute)) {
+            ggml_vk_h2d_all(lctx.ctx_kompute);
+        } else {
+            ggml_vk_h2d_tensor(lctx.ctx_kompute, toDeviceTensor);
+        }
+    }
+#endif
+
     return gf;
 }
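A plausible reading of the Kompute block above: the first time the graph is built, ggml_vk_h2d_all copies every host tensor to the Vulkan device; on later builds only the fresh per-batch input (toDeviceTensor, i.e. inp_tokens or inpL) is re-uploaded with ggml_vk_h2d_tensor. A minimal standalone sketch of that upload policy follows; the DeviceCtx/upload_* names are hypothetical stand-ins, not the real ggml Vulkan API:

#include <cstdio>
#include <string>
#include <vector>

// Toy stand-ins for the real ggml/Kompute types (names are hypothetical).
struct Tensor { std::string name; };

struct DeviceCtx {
    bool h2d_all_done = false;           // mirrors the ggml_vk_has_h2d_all() flag
    std::vector<Tensor*> all_tensors;    // everything the device may need
};

static void upload_tensor(DeviceCtx& ctx, Tensor* t) {
    (void)ctx;
    std::printf("h2d: uploading '%s'\n", t->name.c_str());
}

static void upload_all(DeviceCtx& ctx) {
    for (Tensor* t : ctx.all_tensors) upload_tensor(ctx, t);
    ctx.h2d_all_done = true;             // weights now resident on the device
}

// Same shape as the #if defined(GGML_USE_KOMPUTE) block in the diff:
// bulk-upload once, then only re-upload the per-batch input tensor.
static void sync_inputs(DeviceCtx& ctx, Tensor* to_device_tensor) {
    if (!ctx.h2d_all_done) {
        upload_all(ctx);                      // first build: weights + inputs
    } else {
        upload_tensor(ctx, to_device_tensor); // later builds: input only
    }
}

int main() {
    Tensor weights{"tok_embeddings"}, inp{"inp_tokens"};
    DeviceCtx ctx;
    ctx.all_tensors = {&weights, &inp};
    sync_inputs(ctx, &inp); // first call uploads everything
    sync_inputs(ctx, &inp); // second call uploads only inp_tokens
}

This avoids re-copying model weights on every token batch while still keeping the single-shot bulk path for the first build.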
@@ -6494,7 +6512,7 @@ struct llama_context * llama_new_context_with_model(
 #elif defined(GGML_USE_KOMPUTE)
     // TODO(cebtenzzre): we need to check the type of each tensor because Q8_0 is not currently supported
     if (ggml_vk_has_device() && params.n_gpu_layers > 0
-        && model->arch == LLM_ARCH_LLAMA
+        && (model->arch == LLM_ARCH_LLAMA || model->arch == LLM_ARCH_FALCON)
         && (model->ftype == LLAMA_FTYPE_ALL_F32
             || model->ftype == LLAMA_FTYPE_MOSTLY_F16
             || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0
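This hunk (cut off in this view) widens the Kompute device gate from LLaMA-only to LLaMA or Falcon while keeping the ftype allowlist. A small sketch of the gate as a standalone predicate; the enums are hypothetical stand-ins mirroring only the values visible above, and the ftype list is reproduced as truncated:

#include <cstdio>

// Hypothetical stand-ins for llama.cpp's enums (values mirror the diff above).
enum class Arch  { LLAMA, FALCON, OTHER };
enum class Ftype { ALL_F32, MOSTLY_F16, MOSTLY_Q4_0, OTHER };

// Sketch of the widened gate: LLaMA-only before this commit, LLaMA or Falcon
// after. Only the ftype entries visible in the truncated diff are listed.
static bool kompute_eligible(bool has_device, int n_gpu_layers, Arch arch, Ftype ftype) {
    const bool arch_ok  = arch == Arch::LLAMA || arch == Arch::FALCON;
    const bool ftype_ok = ftype == Ftype::ALL_F32
                       || ftype == Ftype::MOSTLY_F16
                       || ftype == Ftype::MOSTLY_Q4_0; // list truncated above
    return has_device && n_gpu_layers > 0 && arch_ok && ftype_ok;
}

int main() {
    // Falcon + F16 now passes the gate; it failed before this commit.
    std::printf("falcon f16 eligible: %d\n",
                kompute_eligible(/*has_device=*/true, /*n_gpu_layers=*/32,
                                 Arch::FALCON, Ftype::MOSTLY_F16));
}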