From e8dc55d0065d076d4c20f3c4bfca562701b4edfe Mon Sep 17 00:00:00 2001
From: Jared Van Bortel
Date: Tue, 30 Jan 2024 19:04:37 -0500
Subject: [PATCH] kompute : llama-bench support and ggml_cpu_has_kompute() (#5226)

---
 common/common.cpp                    |  1 +
 examples/llama-bench/llama-bench.cpp | 15 +++++++++++----
 ggml.c                               | 11 ++++++++++-
 ggml.h                               |  1 +
 llama.cpp                            |  5 -----
 5 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 288013676..0dd1c50cf 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1521,6 +1521,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
     fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
     fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
+    fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
     fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
     fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
     fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index f239415d3..542cc7bb8 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -563,6 +563,7 @@ struct test {
     static const bool cuda;
     static const bool opencl;
     static const bool vulkan;
+    static const bool kompute;
     static const bool metal;
     static const bool gpu_blas;
     static const bool blas;
@@ -647,6 +648,9 @@ struct test {
         if (vulkan) {
             return "Vulkan";
         }
+        if (kompute) {
+            return "Kompute";
+        }
         if (metal) {
             return "Metal";
         }
@@ -662,7 +666,7 @@ struct test {
     static const std::vector<std::string> & get_fields() {
         static const std::vector<std::string> fields = {
             "build_commit", "build_number",
-            "cuda", "opencl", "vulkan", "metal", "gpu_blas", "blas",
+            "cuda", "opencl", "vulkan", "kompute", "metal", "gpu_blas", "blas",
             "cpu_info", "gpu_info",
             "model_filename", "model_type", "model_size", "model_n_params",
             "n_batch", "n_threads", "type_k", "type_v",
@@ -686,8 +690,9 @@ struct test {
             field == "avg_ns" || field == "stddev_ns") {
             return INT;
         }
-        if (field == "cuda" || field == "opencl" || field == "vulkan"|| field == "metal" || field == "gpu_blas" || field == "blas" ||
-            field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") {
+        if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
+            field == "gpu_blas" || field == "blas" || field == "f16_kv" || field == "no_kv_offload" ||
+            field == "mul_mat_q") {
             return BOOL;
         }
         if (field == "avg_ts" || field == "stddev_ts") {
@@ -714,7 +719,8 @@ struct test {
         }
         std::vector<std::string> values = {
             build_commit, std::to_string(build_number),
-            std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
+            std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(kompute),
+            std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
             std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
@@ -743,6 +749,7 @@ const int test::build_number = LLAMA_BUILD_NUMBER;
 const bool test::cuda = !!ggml_cpu_has_cublas();
 const bool test::opencl = !!ggml_cpu_has_clblast();
 const bool test::vulkan = !!ggml_cpu_has_vulkan();
+const bool test::kompute = !!ggml_cpu_has_kompute();
 const bool test::metal = !!ggml_cpu_has_metal();
 const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
 const bool test::blas = !!ggml_cpu_has_blas();
diff --git a/ggml.c b/ggml.c
index a7a9ea319..b2c8baaa8 100644
--- a/ggml.c
+++ b/ggml.c
@@ -20473,6 +20473,14 @@ int ggml_cpu_has_vulkan(void) {
 #endif
 }
 
+int ggml_cpu_has_kompute(void) {
+#if defined(GGML_USE_KOMPUTE)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_sycl(void) {
 #if defined(GGML_USE_SYCL)
     return 1;
@@ -20482,7 +20490,8 @@ int ggml_cpu_has_sycl(void) {
 }
 
 int ggml_cpu_has_gpublas(void) {
-    return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_sycl();
+    return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
+           ggml_cpu_has_sycl();
 }
 
 int ggml_cpu_has_sse3(void) {
diff --git a/ggml.h b/ggml.h
index bf782e6ad..afc87b843 100644
--- a/ggml.h
+++ b/ggml.h
@@ -2266,6 +2266,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_cublas (void);
     GGML_API int ggml_cpu_has_clblast (void);
     GGML_API int ggml_cpu_has_vulkan (void);
+    GGML_API int ggml_cpu_has_kompute (void);
     GGML_API int ggml_cpu_has_gpublas (void);
     GGML_API int ggml_cpu_has_sse3 (void);
     GGML_API int ggml_cpu_has_ssse3 (void);
diff --git a/llama.cpp b/llama.cpp
index 7b9a5c079..a490eeab2 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6878,11 +6878,6 @@ static int llama_decode_internal(
         n_threads = std::min(4, n_threads);
     }
 
-    const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
-    if ((ggml_cpu_has_cublas() || ggml_cpu_has_vulkan()) && fully_offloaded) {
-        n_threads = 1;
-    }
-
 #ifdef GGML_USE_MPI
     const int64_t n_layer = hparams.n_layer;
     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
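
For reference, a minimal usage sketch of the API added above (not part of the patch; the file name and output format are illustrative only), showing how a consumer of the public ggml.h header might report the new capability flag next to the existing ones:

    /* check_backends.c -- hypothetical example, not included in the commit */
    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        /* ggml_cpu_has_kompute() is the function introduced by this patch;
           the other two already exist in ggml.h */
        printf("kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
        printf("vulkan:  %s\n", ggml_cpu_has_vulkan()  ? "true" : "false");
        printf("gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
        return 0;
    }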