From 88954f7fbd31aeb8c75140edee03e7a8ad5e2d9c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 25 Jul 2024 18:57:44 +0300 Subject: [PATCH 01/21] tests : fix printfs (#8068) --- tests/test-chat-template.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index 46a7d3aea..a8222caee 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -146,7 +146,7 @@ int main(void) { auto fmt_sys = [&](std::string tmpl) { auto output = llama_chat_format_single(nullptr, tmpl, chat2, sys_msg, false); printf("fmt_sys(%s) : %s\n", tmpl.c_str(), output.c_str()); - printf("-------------------------\n", output.c_str()); + printf("-------------------------\n"); return output; }; assert(fmt_sys("chatml") == "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n"); @@ -165,7 +165,7 @@ int main(void) { auto fmt_single = [&](std::string tmpl) { auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true); printf("fmt_single(%s) : %s\n", tmpl.c_str(), output.c_str()); - printf("-------------------------\n", output.c_str()); + printf("-------------------------\n"); return output; }; assert(fmt_single("chatml") == "\n<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n"); From bf5a81df375f1c71e41462e1f48d57db359c9e80 Mon Sep 17 00:00:00 2001 From: "Andreas (Andi) Kunar" Date: Thu, 25 Jul 2024 18:01:00 +0200 Subject: [PATCH 02/21] ggml : fix build on Windows with Snapdragon X (#8531) * Improvements for Windows with Snapdragon X * Revert "Improvements for Windows with Snapdragon X" This reverts commit bf21397ae5ea7c73d3494db3b91505599909227d. 
* Improvements for Windows with Snapdragon X * WOA build clarifications * WIndows on ARM build clarifications * cmake build for Windows clarifications * Update docs/build.md Co-authored-by: Georgi Gerganov --------- Co-authored-by: AndreasKunar Co-authored-by: Georgi Gerganov --- docs/build.md | 13 ++++++++++++- ggml/src/ggml-aarch64.c | 12 ++++++------ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/docs/build.md b/docs/build.md index 916fcf22d..d9d12c467 100644 --- a/docs/build.md +++ b/docs/build.md @@ -16,7 +16,7 @@ In order to build llama.cpp you have four different options. make ``` - - On Windows: + - On Windows (x86/x64 only, arm64 requires cmake): 1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases). 2. Extract `w64devkit` on your pc. @@ -60,6 +60,17 @@ In order to build llama.cpp you have four different options. cmake -B build -G "Xcode" cmake --build build --config Debug ``` + - Building for Windows (x86, x64 and arm64) with MSVC or clang as compilers: + - Install Visual Studio 2022, e.g. via the [Community Edition](https://visualstudio.microsoft.com/de/vs/community/). In the installer, select at least the following options (this also automatically installs the required additional tools like CMake,...): + - Tab Workload: Desktop-development with C++ + - Tab Components (select quickly via search): C++-_CMake_ Tools for Windows, _Git_ for Windows, C++-_Clang_ Compiler for Windows, MS-Build Support for LLVM-Toolset (clang) + - Please remember to always use a Developer Command Prompt / PowerShell for VS2022 for git, build, test + - For Windows on ARM (arm64, WoA) build with: + ```bash + cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF + cmake --build build-arm64-windows-llvm-release + ``` + Note: Building for arm64 could also be done just with MSVC (with the build-arm64-windows-MSVC preset, or the standard CMake build instructions). 
But MSVC does not support inline ARM assembly-code, used e.g. for the accelerated Q4_0_4_8 CPU kernels. - Using `gmake` (FreeBSD): diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c index 26535b1c4..af53dea17 100644 --- a/ggml/src/ggml-aarch64.c +++ b/ggml/src/ggml-aarch64.c @@ -392,7 +392,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) && "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance"); -#elif defined(__ARM_NEON) && defined(__aarch64__) +#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__)) const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; @@ -501,7 +501,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); } #endif -#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__)) const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; @@ -613,7 +613,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(ncols_interleaved); UNUSED(blocklen); -#if defined(__ARM_FEATURE_SVE) +#if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) if (svcntw() == 8) { const void * b_ptr = vx; const void * a_ptr = vy; @@ -753,7 +753,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * #if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) && "__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance"); -#elif defined(__ARM_NEON) && defined(__aarch64__) +#elif defined(__ARM_NEON) && defined(__aarch64__) && ! ((defined(_MSC_VER)) && ! defined(__clang__)) const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; @@ -1271,7 +1271,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance"); } #endif -#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__)) const void * b_ptr = vx; const void * a_ptr = vy; float * res_ptr = s; @@ -1727,7 +1727,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * UNUSED(ncols_interleaved); UNUSED(blocklen); -#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! 
defined(__clang__)) if (svcntw() == 8) { const void * b_ptr = vx; const void * a_ptr = vy; From 4226a8d10e3904db3a1297919fe6c7f06beba6c0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 25 Jul 2024 19:57:31 +0300 Subject: [PATCH 03/21] llama : fix build + fix fabs compile warnings (#8683) ggml-ci --- src/llama-grammar.h | 2 -- src/llama.cpp | 10 ++-------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/src/llama-grammar.h b/src/llama-grammar.h index 8e578e09f..695ea0632 100644 --- a/src/llama-grammar.h +++ b/src/llama-grammar.h @@ -13,8 +13,6 @@ struct llama_grammar { llama_partial_utf8 partial_utf8; }; -struct llama_grammar * llama_get_grammar(struct llama_context * ctx); - // // internal API // diff --git a/src/llama.cpp b/src/llama.cpp index 972f870b0..bc0183741 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2657,7 +2657,6 @@ struct llama_context { llama_context(const llama_model & model) : model(model) , sampling(llama_n_vocab(&model)) - , grammar() , t_start_us(model.t_start_us) , t_load_us(model.t_load_us) {} @@ -2675,7 +2674,6 @@ struct llama_context { struct llama_cparams cparams; struct llama_sampling sampling; - struct llama_grammar grammar; struct llama_kv_cache kv_self; struct llama_control_vector cvec; @@ -14048,7 +14046,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { f = -INFINITY; } else { if (hparams.use_alibi) { - f = -fabs(lctx.kv_self.cells[i].pos - pos); + f = -std::abs(lctx.kv_self.cells[i].pos - pos); } else { f = 0.0f; } @@ -14102,7 +14100,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { for (int s = 0; s < batch.n_seq_id[i]; ++s) { if (batch.seq_id[i][s] == seq_id) { if (hparams.use_alibi) { - f = -fabs(batch.pos[i] - batch.pos[j]); + f = -std::abs(batch.pos[i] - batch.pos[j]); } else { f = 0.0f; } @@ -16833,10 +16831,6 @@ const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx) { return &ctx->model.vocab; } -struct 
llama_grammar * llama_get_grammar(struct llama_context * ctx) { - return &ctx->grammar; -} - uint32_t llama_n_ctx(const struct llama_context * ctx) { return ctx->cparams.n_ctx; } From 49ce0ab6d45402e8bb622bf86f86529f2b0ba552 Mon Sep 17 00:00:00 2001 From: DavidKorczynski Date: Thu, 25 Jul 2024 22:23:05 +0100 Subject: [PATCH 04/21] ggml: handle ggml_init failure to fix NULL pointer deref (#8692) `ggml_init` can fail if no unused context is found. In that case, a NULL-pointer deref will happen later in the code during a call to `ggml_set_on_alloc`. This fixes it by bailing out if no context is found. --- ggml/src/ggml.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f65837e85..29afcc7f8 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -21096,6 +21096,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p }; *params.ctx = ggml_init(pdata); + if (*params.ctx == NULL) { + fprintf(stderr, "%s: failed to initialize context\n", __func__); + fclose(file); + gguf_free(ctx); + return NULL; + } struct ggml_context * ctx_data = *params.ctx; From 41cd47caab88c442edc50e90c8d8d0ac3e82768d Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 25 Jul 2024 23:49:39 +0200 Subject: [PATCH 05/21] examples : export-lora : fix issue with quantized base models (#8687) --- examples/export-lora/export-lora.cpp | 64 +++++++++++++++++----------- 1 file changed, 39 insertions(+), 25 deletions(-) diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 124ee167d..150f7e8d5 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -211,8 +211,9 @@ struct lora_merge_ctx { } } - // if true, this tensor can be lora-merged. 
if false, we skip merging and just copy data to outfile - std::vector> base_tensors; + // mapping base tensor to out tensor (same shape with base, but different type) + // if out_tensor == nullptr, we only copy it + std::vector> base_to_out_tensors; for (auto & it : base_model.tensors) { bool t_a = true; bool t_b = true; @@ -221,22 +222,22 @@ struct lora_merge_ctx { t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b"); } auto base_tensor = it.second; - struct ggml_tensor * out_tensor; if (!t_a && !t_b) { // only copy - out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor); - ggml_set_name(out_tensor, base_tensor->name); - base_tensors.push_back(std::make_pair(out_tensor, false)); + struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor); + ggml_set_name(cpy_tensor, base_tensor->name); + base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr)); + gguf_add_tensor(ctx_out, cpy_tensor); } else if (t_a && t_b) { // need merging - out_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor); - out_tensor->type = get_out_tensor_type(base_tensor); + struct ggml_tensor * out_tensor = ggml_new_tensor( + ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne); ggml_set_name(out_tensor, base_tensor->name); - base_tensors.push_back(std::make_pair(out_tensor, true)); + base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor)); + gguf_add_tensor(ctx_out, out_tensor); } else { throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b"); } - gguf_add_tensor(ctx_out, out_tensor); } // placeholder for the meta data @@ -247,9 +248,9 @@ struct lora_merge_ctx { // process base model tensors size_t n_merged = 0; - for (auto & it : base_tensors) { - if (it.second) { - merge_tensor(it.first); + for (auto & it : base_to_out_tensors) { + if (it.second != nullptr) { + merge_tensor(it.first, it.second); n_merged++; } else { copy_tensor(it.first); @@ -265,7 +266,7 @@ struct lora_merge_ctx { } 
printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged); - printf("%s : wrote %ld tensors to output file\n", __func__, base_tensors.size()); + printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size()); } void copy_tensor(struct ggml_tensor * base) { @@ -276,7 +277,7 @@ struct lora_merge_ctx { zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len); } - void merge_tensor(struct ggml_tensor * base) { + void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) { std::string name_base(base->name); std::string name_lora_a = name_base + ".lora_a"; std::string name_lora_b = name_base + ".lora_b"; @@ -287,14 +288,14 @@ struct lora_merge_ctx { std::vector inp_a(adapters.size()); std::vector inp_b(adapters.size()); struct ggml_init_params params { - /*.mem_size =*/ ggml_tensor_overhead()*(1+adapters.size()*2), + /*.mem_size =*/ ggml_tensor_overhead()*(2+adapters.size()*2), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; struct ggml_context * ctx = ggml_init(params); // alloc tensors - struct ggml_tensor * inp = ggml_dup_tensor(ctx, base); + struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne); for (size_t i = 0; i < adapters.size(); ++i) { auto t_a = adapters[i]->get_tensor(name_lora_a); auto t_b = adapters[i]->get_tensor(name_lora_b); @@ -303,9 +304,21 @@ struct lora_merge_ctx { } ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); - // load data to backend buffer + // load base tensor to backend buffer base_model.read_tensor_data(name_base, read_buf); - ggml_backend_tensor_set(inp, read_buf.data(), 0, ggml_nbytes(inp)); + if (base->type != GGML_TYPE_F32) { + // optionally dequantize it + printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type)); + auto nels = ggml_nelements(inp_base); + ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type); + std::vector dequant_buf(nels * sizeof(float)); + 
qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels); + ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size()); + } else { + ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base)); + } + + // load lora tensors to backend buffer for (size_t i = 0; i < adapters.size(); ++i) { adapters[i]->read_tensor_data(name_lora_a, read_buf); ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i])); @@ -325,20 +338,21 @@ struct lora_merge_ctx { }; struct ggml_context * ctx0 = ggml_init(params0); gf = ggml_new_graph(ctx0); - struct ggml_tensor * cur = inp; + struct ggml_tensor * cur = inp_base; for (size_t i = 0; i < adapters.size(); ++i) { - struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, inp_a[i])); - struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, inp_b[i]); + struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32))); + struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32)); // scale const float alpha = adapters[i]->alpha; const float rank = (float) inp_b[i]->ne[0]; const float scale = alpha ? 
adapters[i]->scale * alpha / rank : adapters[i]->scale; delta = ggml_scale(ctx0, delta, scale); - cur = ggml_add(ctx0, cur, delta); - printf("%s : + merging from adapter[%ld]\n", __func__, i); + cur = ggml_add(ctx0, delta, cur); + printf("%s : + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type)); printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]); } - cur = ggml_cast(ctx0, cur, get_out_tensor_type(base)); + cur = ggml_cast(ctx0, cur, out->type); + printf("%s : + output type is %s\n", __func__, ggml_type_name(out->type)); ggml_build_forward_expand(gf, cur); ggml_free(ctx0); } From 01aec4a6310ab0160483196db0e726d78d4c94b6 Mon Sep 17 00:00:00 2001 From: Yaiko Date: Thu, 25 Jul 2024 18:10:16 -0400 Subject: [PATCH 06/21] server : add Speech Recognition & Synthesis to UI (#8679) * server : add Speech Recognition & Synthesis to UI * server : add Speech Recognition & Synthesis to UI (fixes) --- examples/server/public/index.html | 180 +++++++++++++++++++++++++++--- 1 file changed, 164 insertions(+), 16 deletions(-) diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 48628a960..07fec6a38 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -1,5 +1,4 @@ - @@ -132,12 +131,20 @@ align-items: stretch; } - .right { + .message-controls { display: flex; - flex-direction: row; - gap: 0.5em; justify-content: flex-end; } + .message-controls > div:nth-child(2) { + display: flex; + flex-direction: column; + gap: 0.5em; + } + .message-controls > div:nth-child(2) > div { + display: flex; + margin-left: auto; + gap: 0.5em; + } fieldset { border: none; @@ -276,6 +283,7 @@ import { llama } from './completion.js'; import { SchemaConverter } from './json-schema-to-grammar.mjs'; + let selected_image = false; var slot_id = -1; @@ -447,6 +455,9 @@ /* END: Support for storing prompt templates and parameters in browsers 
LocalStorage */ + const tts = window.speechSynthesis; + const ttsVoice = signal(null) + const llamaStats = signal(null) const controller = signal(null) @@ -596,8 +607,51 @@ }); } + const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition; + const talkRecognition = SpeechRecognition ? new SpeechRecognition() : null; function MessageInput() { - const message = useSignal("") + const message = useSignal(""); + + const talkActive = useSignal(false); + const sendOnTalk = useSignal(false); + const talkStop = (e) => { + if (e) e.preventDefault(); + + talkActive.value = false; + talkRecognition?.stop(); + } + const talk = (e) => { + e.preventDefault(); + + if (talkRecognition) + talkRecognition.start(); + else + alert("Speech recognition is not supported by this browser."); + } + if(talkRecognition) { + talkRecognition.onstart = () => { + talkActive.value = true; + } + talkRecognition.onresult = (e) => { + if (event.results.length > 0) { + message.value = event.results[0][0].transcript; + if (sendOnTalk.value) { + submit(e); + } + } + } + talkRecognition.onspeechend = () => { + talkStop(); + } + } + + const ttsVoices = useSignal(tts?.getVoices() || []); + const ttsVoiceDefault = computed(() => ttsVoices.value.find(v => v.default)); + if (tts) { + tts.onvoiceschanged = () => { + ttsVoices.value = tts.getVoices(); + } + } const submit = (e) => { stop(e); @@ -624,11 +678,45 @@ value="${message}" /> -
- - - - +
+
+
+
+ + + + +
+ +
+ { + e.preventDefault(); + alert(`TTS supported by your browser: ${tts ? 'Yes' : 'No'}\n(TTS and speech recognition are not provided by llama.cpp)`); + }}>[?] + + +
+
` @@ -659,26 +747,86 @@ } }, [messages]) + const ttsChatLineActiveIx = useSignal(undefined); + const ttsChatLine = (e, ix, msg) => { + if (e) e.preventDefault(); + + if (!tts || !ttsVoice.value || !('SpeechSynthesisUtterance' in window)) return; + + const ttsVoices = tts.getVoices(); + const voice = ttsVoices.find(v => v.name === ttsVoice.value); + if (!voice) return; + + if (ttsChatLineActiveIx.value !== undefined) { + tts.cancel(); + if (ttsChatLineActiveIx.value === ix) { + ttsChatLineActiveIx.value = undefined; + return; + } + } + + ttsChatLineActiveIx.value = ix; + let ttsUtter = new SpeechSynthesisUtterance(msg); + ttsUtter.voice = voice; + ttsUtter.onend = e => { + ttsChatLineActiveIx.value = undefined; + }; + tts.speak(ttsUtter); + } + const isCompletionMode = session.value.type === 'completion' + + // Try play the last bot message + const lastCharChatLinesIxs = useSignal([]); + const lastCharChatLinesIxsOld = useSignal([]); + useEffect(() => { + if ( + !isCompletionMode + && lastCharChatLinesIxs.value.length !== lastCharChatLinesIxsOld.value.length + && !generating.value + ) { + const ix = lastCharChatLinesIxs.value[lastCharChatLinesIxs.value.length - 1]; + if (ix !== undefined) { + const msg = messages[ix]; + ttsChatLine(null, ix, Array.isArray(msg) ? msg[1].map(m => m.content).join('') : msg); + } + + lastCharChatLinesIxsOld.value = structuredClone(lastCharChatLinesIxs.value); + } + }, [generating.value]); + const chatLine = ([user, data], index) => { let message - const isArrayMessage = Array.isArray(data) + const isArrayMessage = Array.isArray(data); + const text = isArrayMessage ? + data.map(msg => msg.content).join('') : + data; if (params.value.n_probs > 0 && isArrayMessage) { message = html`<${Probabilities} data=${data} />` } else { - const text = isArrayMessage ? - data.map(msg => msg.content).join('') : - data; message = isCompletionMode ? 
text : html`<${Markdownish} text=${template(text)} />` } + + const fromBot = user && user === '{{char}}'; + if (fromBot && !lastCharChatLinesIxs.value.includes(index)) + lastCharChatLinesIxs.value.push(index); + if (user) { - return html`

${template(user)}: ${message}

` + return html` +
+

${template(user)}: ${message}

+ ${ + fromBot && ttsVoice.value + && html`
` + } +
+ `; } else { return isCompletionMode ? html`${message}` : - html`

${message}

` + html`

${message}

` } }; From 01245f5b1629075543bc4478418c7d72a0b4b3c7 Mon Sep 17 00:00:00 2001 From: Judd Date: Fri, 26 Jul 2024 16:38:12 +0800 Subject: [PATCH 07/21] llama : fix order of parameters (#8706) usage of `aclrtGetMemInfo` is correct: https://www.hiascend.com/doc_center/source/zh/canncommercial/63RC2/inferapplicationdev/aclcppdevg/aclcppdevg_03_0103.html Co-authored-by: Judd --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index bc0183741..77f7d32f8 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2905,7 +2905,7 @@ static size_t llama_get_device_memory(const llama_model & model, int device) { #elif defined(GGML_USE_CANN) size_t total; size_t free; - ggml_backend_cann_get_device_memory(device, &total, &free); + ggml_backend_cann_get_device_memory(device, &free, &total); return free; #else return 1; From 2b1f616b208a4a21c4ee7a7eb85d822ff1d787af Mon Sep 17 00:00:00 2001 From: slaren Date: Sat, 27 Jul 2024 04:41:55 +0200 Subject: [PATCH 08/21] ggml : reduce hash table reset cost (#8698) * ggml : reduce hash table reset cost * fix unreachable code warnings after GGML_ASSERT(false) * GGML_ASSERT(false) -> GGML_ABORT("fatal error") * GGML_ABORT use format string --- Makefile | 6 +- examples/eval-callback/eval-callback.cpp | 2 +- examples/imatrix/imatrix.cpp | 4 +- examples/llama-bench/llama-bench.cpp | 6 +- examples/llava/clip.cpp | 2 +- examples/tokenize/tokenize.cpp | 2 +- ggml/include/ggml.h | 37 +- ggml/src/ggml-alloc.c | 42 +- ggml/src/ggml-backend.c | 214 +++--- ggml/src/ggml-blas.cpp | 3 +- ggml/src/ggml-cann.cpp | 6 +- ggml/src/ggml-cann/aclnn_ops.cpp | 26 +- ggml/src/ggml-cuda.cu | 6 +- ggml/src/ggml-cuda/argsort.cu | 2 +- ggml/src/ggml-cuda/binbcast.cu | 2 +- ggml/src/ggml-cuda/common.cuh | 2 +- ggml/src/ggml-cuda/cpy.cu | 4 +- ggml/src/ggml-cuda/dmmv.cu | 2 +- ggml/src/ggml-cuda/fattn-common.cuh | 6 +- ggml/src/ggml-cuda/fattn-tile-f16.cu | 2 +- ggml/src/ggml-cuda/fattn-tile-f32.cu | 2 +- 
ggml/src/ggml-cuda/fattn.cu | 10 +- ggml/src/ggml-cuda/getrows.cu | 3 +- ggml/src/ggml-cuda/mmq.cu | 2 +- ggml/src/ggml-cuda/mmq.cuh | 4 +- ggml/src/ggml-cuda/mmvq.cu | 6 +- ggml/src/ggml-cuda/quantize.cu | 2 +- ggml/src/ggml-cuda/rope.cu | 4 +- ggml/src/ggml-impl.h | 116 +++- ggml/src/ggml-kompute.cpp | 8 +- ggml/src/ggml-metal.m | 42 +- ggml/src/ggml-quants.c | 12 +- ggml/src/ggml-sycl.cpp | 16 +- ggml/src/ggml-sycl/common.hpp | 2 +- ggml/src/ggml-sycl/dmmv.cpp | 2 +- ggml/src/ggml-sycl/dpct/helper.hpp | 2 +- ggml/src/ggml-sycl/mmq.cpp | 22 +- ggml/src/ggml-sycl/mmvq.cpp | 2 +- ggml/src/ggml-sycl/rope.cpp | 4 +- ggml/src/ggml-vulkan.cpp | 58 +- ggml/src/ggml.c | 838 +++++++++++------------ src/llama-grammar.cpp | 4 +- src/llama-vocab.cpp | 14 +- src/llama.cpp | 36 +- tests/test-backend-ops.cpp | 8 +- tests/test-sampling.cpp | 10 +- 46 files changed, 851 insertions(+), 754 deletions(-) diff --git a/Makefile b/Makefile index 8d2ccddc4..7e015af3e 100644 --- a/Makefile +++ b/Makefile @@ -325,9 +325,9 @@ ifdef LLAMA_DEBUG endif else MK_CPPFLAGS += -DNDEBUG - MK_CFLAGS += -O3 - MK_CXXFLAGS += -O3 - MK_NVCCFLAGS += -O3 + MK_CFLAGS += -O3 -g + MK_CXXFLAGS += -O3 -g + MK_NVCCFLAGS += -O3 -g endif ifdef LLAMA_SANITIZE_THREAD diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index c8a3016a4..37d30ab8c 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -62,7 +62,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne } else if (type == GGML_TYPE_I8) { v = (float) *(int8_t *) &data[i]; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } printf("%12.4f", v); sum += v; diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 574f5ed9c..6ce1863cf 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -127,7 +127,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * 
} else if (e.values.size() != (size_t)src1->ne[0]*n_as) { fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as); - exit(1); //GGML_ASSERT(false); + exit(1); //GGML_ABORT("fatal error"); } if (m_params.verbosity > 1) { printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type); @@ -176,7 +176,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * } else if (e.values.size() != (size_t)src1->ne[0]) { fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); - exit(1); //GGML_ASSERT(false); + exit(1); //GGML_ABORT("fatal error"); } ++e.ncall; if (m_params.verbosity > 1) { diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index a6497b6e0..521fa8880 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -150,7 +150,7 @@ static const char * output_format_str(output_formats format) { case JSON: return "json"; case MARKDOWN: return "md"; case SQL: return "sql"; - default: GGML_ASSERT(!"invalid output format"); + default: GGML_ABORT("invalid output format"); } } @@ -176,7 +176,7 @@ static const char * split_mode_str(llama_split_mode mode) { case LLAMA_SPLIT_MODE_NONE: return "none"; case LLAMA_SPLIT_MODE_LAYER: return "layer"; case LLAMA_SPLIT_MODE_ROW: return "row"; - default: GGML_ASSERT(!"invalid split mode"); + default: GGML_ABORT("invalid split mode"); } } @@ -1326,7 +1326,7 @@ static std::unique_ptr create_printer(output_formats format) { case SQL: return std::unique_ptr(new sql_printer()); } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } int main(int argc, char ** argv) { diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index d23e282fb..7cda5f10c 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ 
-869,7 +869,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 embeddings = peg_0; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp index 2afb6024c..17f5e4961 100644 --- a/examples/tokenize/tokenize.cpp +++ b/examples/tokenize/tokenize.cpp @@ -163,7 +163,7 @@ static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) { printf(">"); return; } - GGML_ASSERT(false && "MultiByteToWideChar() failed in an unexpected way."); + GGML_ABORT("MultiByteToWideChar() failed in an unexpected way."); } LPWSTR wstr = (LPWSTR) calloc(length_needed+1, sizeof(*wstr)); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 548661b9b..464d765da 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -254,18 +254,8 @@ #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) -#define GGML_ASSERT(x) \ - do { \ - if (!(x)) { \ - fflush(stdout); \ - fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ - ggml_print_backtrace(); \ - abort(); \ - } \ - } while (0) - #ifndef NDEBUG -#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached") +#define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0) #elif defined(__GNUC__) #define GGML_UNREACHABLE() __builtin_unreachable() #elif defined(_MSC_VER) @@ -274,6 +264,17 @@ #define GGML_UNREACHABLE() ((void) 0) #endif +#ifdef __cplusplus +#define GGML_NORETURN [[noreturn]] +#elif defined(_MSC_VER) +#define GGML_NORETURN __declspec(noreturn) +#else +#define GGML_NORETURN _Noreturn +#endif + +#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__) +#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x) + // used to copy the number of elements and stride in bytes of tensors into local variables. // main purpose is to reduce code duplication and improve readability. 
// @@ -322,6 +323,9 @@ extern "C" { #endif + GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4) + GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...); + enum ggml_status { GGML_STATUS_ALLOC_FAILED = -2, GGML_STATUS_FAILED = -1, @@ -636,8 +640,11 @@ extern "C" { GGML_CGRAPH_EVAL_ORDER_COUNT }; + typedef uint32_t ggml_bitset_t; + struct ggml_hash_set { size_t size; + ggml_bitset_t * used; struct ggml_tensor ** keys; }; @@ -651,7 +658,7 @@ extern "C" { struct ggml_tensor ** grads; struct ggml_tensor ** leafs; - struct ggml_hash_set visited_hash_table; + struct ggml_hash_set visited_hash_set; enum ggml_cgraph_eval_order order; }; @@ -698,8 +705,6 @@ extern "C" { GGML_API int64_t ggml_cycles(void); GGML_API int64_t ggml_cycles_per_ms(void); - GGML_API void ggml_print_backtrace(void); - // accepts a UTF-8 path, even on Windows GGML_API FILE * ggml_fopen(const char * fname, const char * mode); @@ -2005,8 +2010,8 @@ extern "C" { // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data - GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); - GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); + GGML_API enum ggml_status ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); // same as ggml_graph_compute() but the work data is allocated as a part of the context // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index e176b883e..e485326ab 100644 --- 
a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -91,8 +91,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) { fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n", __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset); - GGML_ASSERT(!"not enough space in the buffer"); - return; + GGML_ABORT("not enough space in the buffer"); } void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset; @@ -133,7 +132,7 @@ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, return; } } - GGML_ASSERT(!"out of allocated_tensors"); + GGML_ABORT("out of allocated_tensors"); } static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) { for (int i = 0; i < 1024; i++) { @@ -142,8 +141,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs return; } } - fprintf(stderr, "tried to free tensor %s not found\n", tensor->name); - GGML_ASSERT(!"tensor not found"); + GGML_ABORT("tried to free tensor %s not found\n", tensor->name); } #endif @@ -176,8 +174,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz // this should never happen fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n", __func__, size, max_avail); - GGML_ASSERT(!"not enough space in the buffer"); - GGML_UNREACHABLE(); + GGML_ABORT("not enough space in the buffer"); } } @@ -443,7 +440,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) { } } - free(galloc->hash_set.keys); + ggml_hash_set_free(&galloc->hash_set); free(galloc->hash_values); free(galloc->bufts); free(galloc->buffers); @@ -456,7 +453,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) { typedef struct ggml_gallocr * ggml_gallocr_t; static 
struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) { - size_t i = ggml_hash_find_or_insert(galloc->hash_set, t); + size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t); return &galloc->hash_values[i]; } @@ -565,8 +562,8 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) { static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) { // clear hash tables - memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *)); - memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node)); + ggml_hash_set_reset(&galloc->hash_set); + memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size); // allocate leafs // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes @@ -671,21 +668,19 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr } bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) { - size_t hash_size = graph->visited_hash_table.size; + size_t min_hash_size = graph->n_nodes + graph->n_leafs; + // add 25% margin to avoid hash collisions + min_hash_size += min_hash_size / 4; // initialize hash table - if (galloc->hash_set.size < hash_size) { - free(galloc->hash_set.keys); - free(galloc->hash_values); - galloc->hash_set.size = hash_size; - galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *)); - galloc->hash_values = calloc(hash_size, sizeof(struct hash_node)); + if (galloc->hash_set.size < min_hash_size) { + ggml_hash_set_free(&galloc->hash_set); + galloc->hash_set = ggml_hash_set_new(min_hash_size); GGML_ASSERT(galloc->hash_set.keys != NULL); + + free(galloc->hash_values); + galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size); 
GGML_ASSERT(galloc->hash_values != NULL); - } else { - // reset hash table - memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size); - memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size); } // reset allocators @@ -817,8 +812,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * } static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) { - ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL; - size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node); + size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node); return talloc->size_max >= node_size; } diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c index d39cfed88..954ab2072 100644 --- a/ggml/src/ggml-backend.c +++ b/ggml/src/ggml-backend.c @@ -1055,11 +1055,10 @@ struct ggml_backend_sched { ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS]; ggml_gallocr_t galloc; - // hash keys of the nodes in the graph - struct ggml_hash_set hash_set; - // hash values - int * tensor_backend_id; - struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES]; + // hash map of the nodes in the graph + struct ggml_hash_set hash_set; + int * hv_tensor_backend_ids; // [hash_set.size] + struct ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies] int * node_backend_ids; // [graph_size] int * leaf_backend_ids; // [graph_size] @@ -1068,7 +1067,7 @@ struct ggml_backend_sched { int * prev_leaf_backend_ids; // [graph_size] // copy of the graph with modified inputs - struct ggml_cgraph * graph; + struct ggml_cgraph graph; // graph splits struct ggml_backend_sched_split * splits; @@ -1087,19 +1086,16 @@ struct ggml_backend_sched { 
ggml_backend_sched_eval_callback callback_eval; void * callback_eval_user_data; - bool debug; + char * context_buffer; + size_t context_buffer_size; - // align context_buffer to GGML_MEM_ALIGN -#ifdef _MSC_VER - __declspec(align(GGML_MEM_ALIGN)) -#else - __attribute__((aligned(GGML_MEM_ALIGN))) -#endif - char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)]; + bool debug; }; -#define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor) -#define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)] +#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor) +#define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)] +#define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)] +#define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id) // returns the priority of the backend, lower id is higher priority static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) { @@ -1169,7 +1165,6 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st return cur_backend_id; } - // assign nodes that use weights to the backend of the weights // operations with weights are preferably run on the same backend as the weights for (int i = 0; i < GGML_MAX_SRC; i++) { const struct ggml_tensor * src = tensor->src[i]; @@ -1275,7 +1270,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg sched->is_reset = false; struct ggml_init_params params = { - /* .mem_size = */ sizeof(sched->context_buffer), + /* .mem_size = */ sched->context_buffer_size, /* .mem_buffer = */ sched->context_buffer, /* .no_alloc = */ true }; @@ -1284,39 +1279,43 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg 
sched->ctx = ggml_init(params); if (sched->ctx == NULL) { - fprintf(stderr, "%s: failed to initialize context\n", __func__); - GGML_ASSERT(false); + GGML_ABORT("%s: failed to initialize context\n", __func__); } // pass 1: assign backends to ops with pre-allocated inputs for (int i = 0; i < graph->n_leafs; i++) { struct ggml_tensor * leaf = graph->leafs[i]; int * leaf_backend_id = &tensor_backend_id(leaf); - if (*leaf_backend_id != -1) { - // do not overwrite user assignments - continue; + // do not overwrite user assignments + if (*leaf_backend_id == -1) { + *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf); } - *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf); } for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; int * node_backend_id = &tensor_backend_id(node); - if (*node_backend_id != -1) { - // do not overwrite user assignments - continue; - } - *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node); - // src - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { + // do not overwrite user assignments + if (*node_backend_id == -1) { + *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node); + +#if 0 + // src + if (node->op == GGML_OP_NONE) { continue; } - int * src_backend_id = &tensor_backend_id(src); - if (*src_backend_id == -1) { - *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src); + + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + continue; + } + int * src_backend_id = &tensor_backend_id(src); + if (*src_backend_id == -1) { + *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src); + } } +#endif } } @@ -1488,12 +1487,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } - // pass 4: split graph, find tensors that need to be copied + // pass 5: split graph, find tensors 
that need to be copied { int i_split = 0; struct ggml_backend_sched_split * split = &sched->splits[0]; // find the backend of the first split, skipping view ops - for (int i = 0; i < graph->n_nodes; i++) { + int i = 0; + for (; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (!ggml_is_view_op(node->op)) { split->backend_id = tensor_backend_id(node); @@ -1502,9 +1502,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } split->i_start = 0; split->n_inputs = 0; - memset(split->inputs, 0, sizeof(split->inputs)); //HACK int cur_backend_id = split->backend_id; - for (int i = 0; i < graph->n_nodes; i++) { + for (; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view_op(node->op)) { @@ -1513,7 +1512,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg const int node_backend_id = tensor_backend_id(node); - GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now + assert(node_backend_id != -1); // all nodes should be assigned by now // check if we should start a new split based on the sources of the current node bool need_new_split = false; @@ -1527,7 +1526,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // by starting a new split, the memory of the previously offloaded weights can be reused if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { int src_backend_id = tensor_backend_id(src); - if (src_backend_id != -1 && src_backend_id != cur_backend_id) { + if (src_backend_id != cur_backend_id) { need_new_split = true; break; } @@ -1536,9 +1535,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // FIXME: count the number of inputs instead of only checking when full if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) { const size_t id = hash_id(src); - int src_backend_id = sched->tensor_backend_id[id]; + int src_backend_id = 
sched->hv_tensor_backend_ids[id]; bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id); - if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) { + if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) { //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name); need_new_split = true; break; @@ -1570,12 +1569,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg continue; } - const int src_backend_id = tensor_backend_id(src); + size_t src_id = hash_id(src); + const int src_backend_id = sched->hv_tensor_backend_ids[src_id]; assert(src_backend_id != -1); // all inputs should be assigned by now if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) { - size_t id = hash_id(src); - if (sched->tensor_copies[id][src_backend_id][0] == NULL) { + if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) { ggml_backend_t backend = sched->backends[src_backend_id]; for (int c = 0; c < sched->n_copies; c++) { struct ggml_tensor * tensor_copy; @@ -1589,7 +1588,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ggml_set_input(tensor_copy); ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor } - sched->tensor_copies[id][src_backend_id][c] = tensor_copy; + tensor_id_copy(src_id, src_backend_id, c) = tensor_copy; SET_CAUSE(tensor_copy, "4.cpy"); } int n_graph_inputs = sched->n_graph_inputs++; @@ -1598,11 +1597,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } - bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id); - if (src_backend_id != cur_backend_id && !supported) { + if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) { // create a copy of the input in the split's backend - const 
size_t id = hash_id(src); - if (sched->tensor_copies[id][cur_backend_id][0] == NULL) { + if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) { ggml_backend_t backend = sched->backends[cur_backend_id]; for (int c = 0; c < sched->n_copies; c++) { struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); @@ -1611,14 +1608,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ggml_set_input(tensor_copy); ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor } - sched->tensor_copies[id][cur_backend_id][c] = tensor_copy; + tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy; SET_CAUSE(tensor_copy, "4.cpy"); } int n_inputs = split->n_inputs++; GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); split->inputs[n_inputs] = src; } - node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy]; + node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy); } } } @@ -1630,7 +1627,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ggml_backend_sched_print_assignments(sched, graph); } - // swap node_backend_ids and leaf_backend_ids and prevs + // swap node_backend_ids and leaf _backend_ids with prevs { int * tmp = sched->node_backend_ids; sched->node_backend_ids = sched->prev_node_backend_ids; @@ -1641,9 +1638,19 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg sched->prev_leaf_backend_ids = tmp; } - // create copies of the graph for each split - // TODO: avoid this copy - struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false); + int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2; + if (sched->graph.size < graph_size) { + sched->graph.size = graph_size; + sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *)); + sched->graph.leafs = 
realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *)); + GGML_ASSERT(sched->graph.nodes != NULL); + GGML_ASSERT(sched->graph.leafs != NULL); + } + sched->graph.n_nodes = 0; + sched->graph.n_leafs = 0; + + struct ggml_cgraph * graph_copy = &sched->graph; + for (int i = 0; i < sched->n_splits; i++) { struct ggml_backend_sched_split * split = &sched->splits[i]; split->graph = ggml_graph_view(graph, split->i_start, split->i_end); @@ -1654,12 +1661,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg struct ggml_tensor * input = split->inputs[j]; const size_t input_id = hash_id(input); - struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy]; + struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy); // add a dependency to the input source so that it is not freed before the copy is done struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input); input_dep->src[0] = input; - sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id]; + sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id]; graph_copy->nodes[graph_copy->n_nodes++] = input_dep; // add a dependency to the input copy so that it is allocated at the start of the split @@ -1681,7 +1688,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg size_t id = hash_id(input); int backend_id = tensor_backend_id(input); for (int c = 0; c < sched->n_copies; c++) { - struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c]; + struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c); sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id; graph_copy->leafs[graph_copy->n_leafs++] = input_cpy; } @@ -1694,7 +1701,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg struct ggml_tensor * input = split->inputs[j]; size_t id = hash_id(input); 
for (int c = 0; c < sched->n_copies; c++) { - struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c]; + struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c); sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id; graph_copy->leafs[graph_copy->n_leafs++] = input_cpy; } @@ -1708,13 +1715,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf); graph_copy->leafs[graph_copy->n_leafs++] = leaf; } - - sched->graph = graph_copy; } static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { bool backend_ids_changed = false; - for (int i = 0; i < sched->graph->n_nodes; i++) { + for (int i = 0; i < sched->graph.n_nodes; i++) { if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] && sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) { backend_ids_changed = true; @@ -1722,7 +1727,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { } } if (!backend_ids_changed) { - for (int i = 0; i < sched->graph->n_leafs; i++) { + for (int i = 0; i < sched->graph.n_leafs; i++) { if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] && sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) { backend_ids_changed = true; @@ -1732,14 +1737,14 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { } // allocate graph - if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { + if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) { // the re-allocation may cause the split inputs to be moved to a different address ggml_backend_sched_synchronize(sched); #ifndef NDEBUG - fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__); + fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, 
backend_ids_changed); #endif - ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids); - if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { + ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids); + if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) { fprintf(stderr, "%s: failed to allocate graph\n", __func__); return false; } @@ -1760,7 +1765,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s for (int j = 0; j < split->n_inputs; j++) { ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]); struct ggml_tensor * input = split->inputs[j]; - struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy]; + struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy); if (input->flags & GGML_TENSOR_FLAG_INPUT) { // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done @@ -1846,21 +1851,23 @@ ggml_backend_sched_t ggml_backend_sched_new( struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched)); sched->debug = getenv("GGML_SCHED_DEBUG") != NULL; + sched->n_backends = n_backends; + sched->n_copies = parallel ? 
GGML_SCHED_MAX_COPIES : 1; // initialize hash table - sched->hash_set = ggml_hash_set_new(graph_size); - sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0])); - sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0])); + // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead) + sched->hash_set = ggml_hash_set_new(graph_size); + sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0])); + sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *)); const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2; - sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0])); - sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0])); + sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0])); + sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0])); sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0])); sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0])); - sched->n_backends = n_backends; - - sched->n_copies = parallel ? 
GGML_SCHED_MAX_COPIES : 1; + sched->context_buffer_size = GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false); + sched->context_buffer = malloc(sched->context_buffer_size); const int initial_splits_capacity = 16; sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0])); @@ -1895,37 +1902,37 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { } ggml_gallocr_free(sched->galloc); ggml_free(sched->ctx); + ggml_hash_set_free(&sched->hash_set); free(sched->splits); - free(sched->hash_set.keys); - free(sched->tensor_backend_id); - free(sched->tensor_copies); + free(sched->hv_tensor_backend_ids); + free(sched->hv_tensor_copies); free(sched->node_backend_ids); free(sched->leaf_backend_ids); free(sched->prev_node_backend_ids); free(sched->prev_leaf_backend_ids); + free(sched->context_buffer); + free(sched->graph.nodes); + free(sched->graph.leafs); free(sched); } void ggml_backend_sched_reset(ggml_backend_sched_t sched) { // reset state for the next run if (!sched->is_reset) { - size_t hash_size = sched->hash_set.size; - memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT - memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size); - memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size); - + ggml_hash_set_reset(&sched->hash_set); + memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0])); + memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *)); sched->is_reset = true; } sched->is_alloc = false; } bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { - GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes); + GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs); 
ggml_backend_sched_split_graph(sched, measure_graph); - // TODO: extract this to a separate function - if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) { + if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) { return false; } @@ -1936,10 +1943,11 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * } bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes); + GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs); ggml_backend_sched_split_graph(sched, graph); + if (!ggml_backend_sched_alloc_splits(sched)) { return false; } @@ -2009,6 +2017,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); tensor_backend_id(node) = backend_index; SET_CAUSE(node, "usr"); + sched->is_reset = false; } ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) { @@ -2051,9 +2060,9 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, GGML_ASSERT(src != NULL); GGML_ASSERT(src->data && "graph must be allocated"); - size_t id = ggml_hash_insert(hash_set, src); - if (id == GGML_HASHTABLE_ALREADY_EXISTS) { - return node_copies[ggml_hash_find(hash_set, src)]; + size_t id = ggml_hash_insert(&hash_set, src); + if (id == GGML_HASHSET_ALREADY_EXISTS) { + return node_copies[ggml_hash_find(&hash_set, src)]; } struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? 
ctx_allocated : ctx_unallocated, src); @@ -2078,7 +2087,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, return dst; } -static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) { +static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) { size_t id = ggml_hash_find(hash_set, src); if (node_init[id]) { return; @@ -2105,10 +2114,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te } struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) { - struct ggml_hash_set hash_set = { - /* .size = */ graph->visited_hash_table.size, - /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT - }; + struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size); struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT bool * node_init = calloc(hash_set.size, sizeof(node_init[0])); @@ -2123,7 +2129,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s if (ctx_allocated == NULL || ctx_unallocated == NULL) { fprintf(stderr, "failed to allocate context for graph copy\n"); - free(hash_set.keys); + ggml_hash_set_free(&hash_set); free(node_copies); free(node_init); ggml_free(ctx_allocated); @@ -2146,7 +2152,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend); if (buffer == NULL) { fprintf(stderr, "failed to allocate buffer for graph copy\n"); - free(hash_set.keys); + ggml_hash_set_free(&hash_set); free(node_copies); free(node_init); ggml_free(ctx_allocated); @@ -2164,19 +2170,19 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s 
// copy data and init views for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - graph_copy_init_tensor(hash_set, node_copies, node_init, node); + graph_copy_init_tensor(&hash_set, node_copies, node_init, node); } // build graph copy struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false); for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - struct ggml_tensor * node_copy = node_copies[ggml_hash_find(hash_set, node)]; + struct ggml_tensor * node_copy = node_copies[ggml_hash_find(&hash_set, node)]; graph_copy->nodes[i] = node_copy; } graph_copy->n_nodes = graph->n_nodes; - free(hash_set.keys); + ggml_hash_set_free(&hash_set); free(node_copies); free(node_init); diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp index a37aa4072..713731735 100644 --- a/ggml/src/ggml-blas.cpp +++ b/ggml/src/ggml-blas.cpp @@ -275,8 +275,7 @@ GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t break; default: - fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node)); - GGML_ASSERT(false); + GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node)); } } diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp index 9bf7e332a..ad5feea05 100644 --- a/ggml/src/ggml-cann.cpp +++ b/ggml/src/ggml-cann.cpp @@ -120,7 +120,7 @@ static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) { file, line); GGML_CANN_LOG_ERROR(" %s\n", stmt); // abort with GGML_ASSERT to get a stack trace - GGML_ASSERT(!"CANN error"); + GGML_ABORT("CANN error"); } /** @@ -342,7 +342,7 @@ struct ggml_cann_pool_leg : public ggml_cann_pool { // memory should always buffered. these memory may still needed by // tasks in stream. // TODO, fix me. 
- GGML_ASSERT(!"Cann buffer pool full, increase MAX_CANN_BUFFERS\n"); + GGML_ABORT("Cann buffer pool full, increase MAX_CANN_BUFFERS\n"); } }; @@ -1874,7 +1874,7 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend, ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(), (aclrtEvent)event->context)); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index a02efc828..f27666970 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -844,7 +844,7 @@ void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_max_pool2d(ctx, dst); break; case GGML_OP_POOL_COUNT: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } @@ -931,9 +931,9 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->nb); return; } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (dst->type == GGML_TYPE_F32) { if (ggml_are_same_shape(src, dst)) { @@ -955,12 +955,12 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->nb); return; } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } // TODO - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } else if (src->type == GGML_TYPE_F32) { // TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size // && nb0 == type_size) @@ -991,10 +991,10 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->nb); return; } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } else { // TODO: dst not contiguous - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } if (dst->type == GGML_TYPE_F16) { @@ -1017,11 +1017,11 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->nb); return; } - 
GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } // TODO - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } else { if (ggml_are_same_shape(src, dst)) { cann_copy(ctx, acl_src, acl_dst); @@ -1029,7 +1029,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclDestroyTensor(acl_dst)); return; } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -2219,7 +2219,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ((ggml_tensor*)dst->extra)->nb); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } @@ -2492,7 +2492,7 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_mul_mat_q8_0(ctx, dst); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index e48269e46..54ccf6bb1 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -98,7 +98,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line); GGML_CUDA_LOG_ERROR(" %s\n", stmt); // abort with GGML_ASSERT to get a stack trace - GGML_ASSERT(!"CUDA error"); + GGML_ABORT("CUDA error"); } // this is faster on Windows @@ -1596,7 +1596,7 @@ static void ggml_cuda_op_mul_mat( CUDA_CHECK(ggml_cuda_cpy_tensor_2d( src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream)); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (quantize_src1 && !src1_is_contiguous) { @@ -2945,7 +2945,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event)); #endif - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu index 15757ca18..607ded855 100644 --- a/ggml/src/ggml-cuda/argsort.cu +++ 
b/ggml/src/ggml-cuda/argsort.cu @@ -81,7 +81,7 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co } else if (order == GGML_SORT_ORDER_DESC) { k_argsort_f32_i32<<>>(x, dst, ncols, ncols_pad); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu index 19b08b74f..34bc67acd 100644 --- a/ggml/src/ggml-cuda/binbcast.cu +++ b/ggml/src/ggml-cuda/binbcast.cu @@ -259,7 +259,7 @@ static void ggml_cuda_op_bin_bcast( } else { fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 1c2e00c1e..eac026f47 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -348,7 +348,7 @@ static __device__ void no_device_code( #ifdef __CUDA_ARCH__ #define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__)) #else -#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.") +#define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.") #endif // __CUDA_ARCH__ static __device__ __forceinline__ float warp_reduce_sum(float x) { diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index 3db57034b..aad34bfe5 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -451,7 +451,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg } else { fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -484,6 +484,6 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) { } else { fprintf(stderr, "%s: unsupported 
type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), ggml_type_name(src1->type)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/ggml/src/ggml-cuda/dmmv.cu b/ggml/src/ggml-cuda/dmmv.cu index 174489e06..d7a2a2513 100644 --- a/ggml/src/ggml-cuda/dmmv.cu +++ b/ggml/src/ggml-cuda/dmmv.cu @@ -662,7 +662,7 @@ void ggml_cuda_op_dequantize_mul_mat_vec( convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index f24312dd0..950fd93df 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -564,7 +564,7 @@ static void on_no_fattn_vec_case(const int D) { fprintf(stderr, "Unsupported KV type combination for head_size 64.\n"); fprintf(stderr, "By default only f16 KV cache is supported.\n"); fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for V cache quantization support.\n"); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } else if (D == 128) { fprintf(stderr, "Unsupported KV type combination for head_size 128.\n"); fprintf(stderr, "Supported combinations:\n"); @@ -572,11 +572,11 @@ static void on_no_fattn_vec_case(const int D) { fprintf(stderr, " - K == q8_0, V == q8_0, 8.50 BPV\n"); fprintf(stderr, " - K == f16, V == f16, 16.00 BPV\n"); fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n"); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } else { fprintf(stderr, "Unsupported KV type combination for head_size 256.\n"); fprintf(stderr, "Only f16 is supported.\n"); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/ggml/src/ggml-cuda/fattn-tile-f16.cu b/ggml/src/ggml-cuda/fattn-tile-f16.cu index c6c35134d..1b2fd500b 100644 --- a/ggml/src/ggml-cuda/fattn-tile-f16.cu +++ b/ggml/src/ggml-cuda/fattn-tile-f16.cu @@ -287,7 
+287,7 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true); } break; default: { - GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128."); + GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128."); } break; } } diff --git a/ggml/src/ggml-cuda/fattn-tile-f32.cu b/ggml/src/ggml-cuda/fattn-tile-f32.cu index 15e22f495..f3e68dbfa 100644 --- a/ggml/src/ggml-cuda/fattn-tile-f32.cu +++ b/ggml/src/ggml-cuda/fattn-tile-f32.cu @@ -284,7 +284,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * launch_fattn(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true); } break; default: { - GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128."); + GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128."); } break; } } diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index 38d30b210..29f608b0f 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -38,7 +38,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, float>(ctx, dst); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } else { @@ -63,7 +63,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g // ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst); // break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } @@ -86,7 +86,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } return; @@ -114,7 +114,7 @@ static void 
ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } return; @@ -141,7 +141,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu index 55af195fd..4c3703238 100644 --- a/ggml/src/ggml-cuda/getrows.cu +++ b/ggml/src/ggml-cuda/getrows.cu @@ -171,8 +171,7 @@ void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { break; default: // TODO: k-quants - fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); - GGML_ASSERT(false); + GGML_ABORT("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); break; } } diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index 84f6387e2..78d70cd7a 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -84,7 +84,7 @@ void ggml_cuda_op_mul_mat_q( mul_mat_q_case(ctx, args, stream); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index f08a4758d..e8a957447 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -75,7 +75,7 @@ static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) { case GGML_TYPE_IQ4_NL: return MMQ_Q8_1_DS_LAYOUT_D4; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } @@ -2898,7 +2898,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda break; default: fprintf(stderr, "mmq_x_best=%d\n", mmq_x_best); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } diff --git a/ggml/src/ggml-cuda/mmvq.cu 
b/ggml/src/ggml-cuda/mmvq.cu index e22faf69b..7dbbc9939 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -162,7 +162,7 @@ static void mul_mat_vec_q_cuda( rows_per_cuda_block = 2; break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } @@ -196,7 +196,7 @@ static void mul_mat_vec_q_cuda( mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } @@ -413,7 +413,7 @@ void ggml_cuda_op_mul_mat_vec_q( mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index aa7f1eff0..45408ce86 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ b/ggml/src/ggml-cuda/quantize.cu @@ -163,7 +163,7 @@ void quantize_mmq_q8_1_cuda( <<>>(x, vy, kx0, kx1, kx0_padded); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu index 596fb7c13..99ec1dd98 100644 --- a/ggml/src/ggml-cuda/rope.cu +++ b/ggml/src/ggml-cuda/rope.cu @@ -251,7 +251,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { attn_factor, corr_dims, freq_factors, stream ); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } else { if (src0->type == GGML_TYPE_F32) { @@ -265,7 +265,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { attn_factor, corr_dims, freq_factors, stream ); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } } diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index a2c8dbec0..7f7afdbfc 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -634,21 +634,121 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { #define GGML_FP32_TO_FP16(x) 
GGML_COMPUTE_FP32_TO_FP16(x) #endif -#define GGML_HASHTABLE_FULL ((size_t)-1) -#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2) +// bitset + +static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated"); +#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8) +#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1) + +static size_t ggml_bitset_size(size_t n) { + return (n + BITSET_MASK) >> BITSET_SHR; +} + +static inline bool ggml_bitset_get(const ggml_bitset_t * bitset, size_t i) { + return !!(bitset[i >> BITSET_SHR] & (1u << (i & BITSET_MASK))); +} + +static inline void ggml_bitset_set(ggml_bitset_t * bitset, size_t i) { + bitset[i >> BITSET_SHR] |= (1u << (i & BITSET_MASK)); +} + +static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) { + bitset[i >> BITSET_SHR] &= ~(1u << (i & BITSET_MASK)); +} + +// hash set + +#define GGML_HASHSET_FULL ((size_t)-1) +#define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2) struct ggml_hash_set ggml_hash_set_new(size_t size); +void ggml_hash_set_free(struct ggml_hash_set * hash_set); -bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key); +// returns the minimum size for a hash set that can hold min_sz elements +size_t ggml_hash_size(size_t min_sz); -// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted -size_t ggml_hash_find (const struct ggml_hash_set hash_set, struct ggml_tensor * key); +// remove all elements from the hash set +void ggml_hash_set_reset(struct ggml_hash_set * hash_set); -// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full -size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml_tensor * key); +// returns true if key is in the hash set +static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key); + +// returns GGML_HASHSET_FULL if table is full, otherwise the current index of the key or 
where it should be inserted +static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key); + +// returns GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full +static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key); // return index, asserts if table is full -size_t ggml_hash_find_or_insert( struct ggml_hash_set hash_set, struct ggml_tensor * key); +static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key); + +// hash function for ggml_tensor +static inline size_t ggml_hash(const struct ggml_tensor * p) { + // the last 4 bits are always zero due to alignment + return (size_t)(uintptr_t)p >> 4; +} + +static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) { + size_t h = ggml_hash(key) % hash_set->size; + + // linear probing + size_t i = h; + while (ggml_bitset_get(hash_set->used, i) && hash_set->keys[i] != key) { + i = (i + 1) % hash_set->size; + if (i == h) { + // visited all hash table entries -> not found + return GGML_HASHSET_FULL; + } + } + return i; +} + +static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) { + size_t i = ggml_hash_find(hash_set, key); + return i != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, i); +} + +static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) { + size_t h = ggml_hash(key) % hash_set->size; + + // linear probing + size_t i = h; + do { + if (!ggml_bitset_get(hash_set->used, i)) { + ggml_bitset_set(hash_set->used, i); + hash_set->keys[i] = key; + return i; + } + if (hash_set->keys[i] == key) { + return GGML_HASHSET_ALREADY_EXISTS; + } + i = (i + 1) % hash_set->size; + } while (i != h); + + // visited all hash table entries -> not found + GGML_ABORT("fatal error"); +} + +static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) 
{ + size_t h = ggml_hash(key) % hash_set->size; + + // linear probing + size_t i = h; + do { + if (!ggml_bitset_get(hash_set->used, i)) { + ggml_bitset_set(hash_set->used, i); + hash_set->keys[i] = key; + return i; + } + if (hash_set->keys[i] == key) { + return i; + } + i = (i + 1) % hash_set->size; + } while (i != h); + + // visited all hash table entries -> not found + GGML_ABORT("fatal error"); +} #ifdef __cplusplus } diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp index ed5f2e349..41ac63fa4 100644 --- a/ggml/src/ggml-kompute.cpp +++ b/ggml/src/ggml-kompute.cpp @@ -566,7 +566,7 @@ uint32_t safe_divide(uint32_t a, uint32_t b) { } if ((a % b) != 0) { fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b); - GGML_ASSERT(!"safe_divide result would've had remainder"); + GGML_ABORT("safe_divide result would've had remainder"); } return a / b; } @@ -1460,7 +1460,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml if (!ggml_vk_supports_op(dst)) { fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst)); - GGML_ASSERT(!"unsupported op"); + GGML_ABORT("unsupported op"); } const int32_t ne00 = src0 ? 
src0->ne[0] : 0; @@ -1562,7 +1562,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml default: { fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } } break; @@ -1745,7 +1745,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml continue; not_implemented: {} fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - //GGML_ASSERT(false); + //GGML_ABORT("fatal error"); } // Evaluate sequence diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index a7619bcca..48b813131 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -869,7 +869,7 @@ static enum ggml_status ggml_metal_graph_compute( NSError * error = nil; if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) { GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]); - GGML_ASSERT(!"capture failed"); + GGML_ABORT("capture failed"); } } @@ -931,7 +931,7 @@ static enum ggml_status ggml_metal_graph_compute( if (!ggml_metal_supports_op(ctx, dst)) { GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst)); - GGML_ASSERT(!"unsupported op"); + GGML_ABORT("unsupported op"); } if (should_capture) { @@ -1068,7 +1068,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break; case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break; case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break; - default: GGML_ASSERT(false); + default: GGML_ABORT("fatal error"); } bcast_row = true; @@ -1077,7 +1077,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_OP_ADD: pipeline = 
ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break; case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break; case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break; - default: GGML_ASSERT(false); + default: GGML_ABORT("fatal error"); } } @@ -1131,7 +1131,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break; case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break; case GGML_TYPE_I16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break; - default: GGML_ASSERT(false); + default: GGML_ABORT("fatal error"); } [encoder setComputePipelineState:pipeline]; @@ -1387,7 +1387,7 @@ static enum ggml_status ggml_metal_graph_compute( default: { GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } break; case GGML_OP_SQR: @@ -1609,7 +1609,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32 ].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break; case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break; - default: GGML_ASSERT(false && "MUL MAT-MAT not implemented"); + default: GGML_ABORT("MUL MAT-MAT not implemented"); } [encoder setComputePipelineState:pipeline]; @@ -1782,7 +1782,7 @@ static enum ggml_status ggml_metal_graph_compute( default: { GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t); - GGML_ASSERT(false && "not implemented"); + GGML_ABORT("not implemented"); } }; @@ -1911,7 +1911,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32 ].pipeline; 
break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break; case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break; - default: GGML_ASSERT(false && "MUL_MAT_ID not implemented"); + default: GGML_ABORT("MUL_MAT_ID not implemented"); } [encoder setComputePipelineState:pipeline]; @@ -2078,7 +2078,7 @@ static enum ggml_status ggml_metal_graph_compute( default: { GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t); - GGML_ASSERT(false && "not implemented"); + GGML_ABORT("not implemented"); } }; @@ -2178,7 +2178,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break; case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS ].pipeline; break; case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break; - default: GGML_ASSERT(false && "not implemented"); + default: GGML_ABORT("not implemented"); } [encoder setComputePipelineState:pipeline]; @@ -2316,13 +2316,13 @@ static enum ggml_status ggml_metal_graph_compute( switch (src0->type) { case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break; - default: GGML_ASSERT(false); + default: GGML_ABORT("fatal error"); }; } else { switch (src0->type) { case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break; - default: GGML_ASSERT(false); + default: GGML_ABORT("fatal error"); }; } @@ -2399,7 +2399,7 @@ static enum ggml_status ggml_metal_graph_compute( switch (dst->type) { case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline; break; case 
GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break; - default: GGML_ASSERT(false); + default: GGML_ABORT("fatal error"); }; [encoder setComputePipelineState:pipeline]; @@ -2556,7 +2556,7 @@ static enum ggml_status ggml_metal_graph_compute( switch (order) { case GGML_SORT_ORDER_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break; case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break; - default: GGML_ASSERT(false); + default: GGML_ABORT("fatal error"); }; [encoder setComputePipelineState:pipeline]; @@ -2645,7 +2645,7 @@ static enum ggml_status ggml_metal_graph_compute( { GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00); GGML_METAL_LOG_ERROR("add template specialization for this size\n"); - GGML_ASSERT(false && "add template specialization for this size"); + GGML_ABORT("add template specialization for this size"); } } } else { @@ -2658,7 +2658,7 @@ static enum ggml_status ggml_metal_graph_compute( { GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00); GGML_METAL_LOG_ERROR("add template specialization for this size\n"); - GGML_ASSERT(false && "add template specialization for this size"); + GGML_ABORT("add template specialization for this size"); } } } @@ -2779,7 +2779,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break; case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL].pipeline; break; - default: GGML_ASSERT(false && "not implemented"); + default: GGML_ABORT("not implemented"); }; } break; case GGML_TYPE_F16: @@ -2787,10 +2787,10 @@ static enum ggml_status ggml_metal_graph_compute( switch (dstt) { case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break; case 
GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break; - default: GGML_ASSERT(false && "not implemented"); + default: GGML_ABORT("not implemented"); }; } break; - default: GGML_ASSERT(false && "not implemented"); + default: GGML_ABORT("not implemented"); } [encoder setComputePipelineState:pipeline]; @@ -2818,7 +2818,7 @@ static enum ggml_status ggml_metal_graph_compute( default: { GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 47418597c..1c6c85aac 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -12692,7 +12692,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict printf("Oops: found point %u not on grid:", u); for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]); printf("\n"); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } q2[2*ib+0] |= ((uint32_t) grid_index << 8*k); q2[2*ib+1] |= (block_signs[k] << 7*k); @@ -12871,7 +12871,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v printf("Oops: found point %u not on grid:", u); for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]); printf("\n"); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } q2[2*ib+k] = grid_index | (block_signs[k] << 9); } @@ -13314,7 +13314,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v printf("Oops: found point %u not on grid:", u); for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]); printf("\n"); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (grid_size == 256) { q3[8*ib+k] = grid_index; @@ -13527,7 +13527,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo printf("Oops: found point %u not on grid:", u); for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]); printf("\n"); - GGML_ASSERT(false); + 
GGML_ABORT("fatal error"); } qs[k] = grid_index & 255; qh[(ib*bs4+k)/8] |= ((grid_index >> 8) << ((ib*bs4+k)%8)); @@ -14503,7 +14503,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy printf("Oops: found point %u not on grid:", u); for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]); printf("\n"); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int i8 = 2*ib + k; y[ibl].qs[i8] = grid_index & 255; @@ -14623,7 +14623,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte } if (nbytes % ggml_type_size(type) != 0) { - fprintf(stderr, "%s: invalid size %zu for type %d\n", __func__, nbytes, type); + fprintf(stderr, "%s: invalid size %zu for type %s (type size = %zu)\n", __func__, nbytes, ggml_type_name(type), ggml_type_size(type)); return false; } diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 36518ff93..7cb07d0dc 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -1723,7 +1723,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols, }); }); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -2075,8 +2075,8 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst, // GGML_SYCL_DEBUG("current device index %d\n", id); src_ptr = (char *) extra->data_device[id]; } else { - // GGML_SYCL_DEBUG("GGML_ASSERT(false)\n"); - GGML_ASSERT(false); + // GGML_SYCL_DEBUG("GGML_ABORT("fatal error")\n"); + GGML_ABORT("fatal error"); } char * dst_ptr = (char *) dst; @@ -2163,7 +2163,7 @@ static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_te default: // TODO: k-quants fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } @@ -2192,7 +2192,7 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t } else { fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, 
ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -2476,7 +2476,7 @@ static int64_t get_row_rounding(ggml_type type, const std::arraytype), ggml_type_name(src1->type)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } (void) dst; diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 397bd98dd..86d8b40e8 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -100,7 +100,7 @@ static void crash() { const char* msg) { fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg); fprintf(stderr, " in function %s at %s:%d\n", func, file, line); - GGML_ASSERT(!"SYCL error"); + GGML_ABORT("SYCL error"); } #define SYCL_CHECK(err) \ diff --git a/ggml/src/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp index 70a94fc16..ae45630e1 100644 --- a/ggml/src/ggml-sycl/dmmv.cpp +++ b/ggml/src/ggml-sycl/dmmv.cpp @@ -1011,7 +1011,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec( break; default: printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index 4aaa76bfb..ef4609e32 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -975,7 +975,7 @@ namespace dpct if (backend == "opencl:cpu") return 4; if (backend == "opencl:acc") return 5; printf("convert_backend_index: can't handle backend=%s\n", backend.c_str()); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } static bool compare_backend(std::string &backend1, std::string &backend2) { return convert_backend_index(backend1) < convert_backend_index(backend2); diff --git a/ggml/src/ggml-sycl/mmq.cpp b/ggml/src/ggml-sycl/mmq.cpp index 3107ba919..e952533d3 100644 --- a/ggml/src/ggml-sycl/mmq.cpp +++ b/ggml/src/ggml-sycl/mmq.cpp @@ -1799,7 +1799,7 @@ static void 
ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q4_0_PASCAL; nwarps = NWARPS_Q4_0_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -1914,7 +1914,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q4_1_PASCAL; nwarps = NWARPS_Q4_1_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -2029,7 +2029,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q5_0_PASCAL; nwarps = NWARPS_Q5_0_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -2144,7 +2144,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q5_1_PASCAL; nwarps = NWARPS_Q5_1_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -2259,7 +2259,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q8_0_PASCAL; nwarps = NWARPS_Q8_0_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -2374,7 +2374,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q2_K_PASCAL; nwarps = NWARPS_Q2_K_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -2497,7 +2497,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q3_K_PASCAL; nwarps = NWARPS_Q3_K_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -2625,7 +2625,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q4_K_PASCAL; nwarps = NWARPS_Q4_K_PASCAL; } else { - 
GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -2746,7 +2746,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q5_K_PASCAL; nwarps = NWARPS_Q5_K_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -2867,7 +2867,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy, mmq_y = MMQ_Y_Q6_K_PASCAL; nwarps = NWARPS_Q6_K_PASCAL; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; @@ -3016,7 +3016,7 @@ void ggml_sycl_op_mul_mat_q( ggml_mul_mat_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp index 3fbc4dd60..23232357e 100644 --- a/ggml/src/ggml-sycl/mmvq.cpp +++ b/ggml/src/ggml-sycl/mmvq.cpp @@ -1017,7 +1017,7 @@ void ggml_sycl_op_mul_mat_vec_q( mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); break; } } diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp index 6f507941a..c7545bcc1 100644 --- a/ggml/src/ggml-sycl/rope.cpp +++ b/ggml/src/ggml-sycl/rope.cpp @@ -251,7 +251,7 @@ void ggml_sycl_op_rope( attn_factor, corr_dims, freq_factors, main_stream ); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } else { if (src0->type == GGML_TYPE_F32) { @@ -265,7 +265,7 @@ void ggml_sycl_op_rope( attn_factor, corr_dims, freq_factors, main_stream ); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 6bcd81a7b..74991f6d1 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ 
-1961,7 +1961,7 @@ void ggml_vk_instance_init() { // Make sure at least one device exists if (devices.empty()) { std::cerr << "ggml_vulkan: Error: No devices found." << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } // Default to using all dedicated GPUs @@ -2459,7 +2459,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont // Buffer is already mapped if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } // Check if src is pinned memory vk_buffer buf; @@ -2527,7 +2527,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont staging = ctx->device->sync_staging; staging_offset = 0; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -2563,7 +2563,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s // Buffer is already mapped if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { std::cerr << "ggml_vulkan: buffer_write_async dst buffer is host_visible. Use synchronous write." 
<< std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } // Check if src is pinned memory vk_buffer buf = nullptr; @@ -2602,7 +2602,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s staging_buffer = dst->device->sync_staging; staging_offset = 0; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -2704,7 +2704,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context * subctx, vk_buffer& src, si staging_buffer = src->device->sync_staging; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -2913,7 +2913,7 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_ } std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) { @@ -3499,7 +3499,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig; if (mmp == nullptr) { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } // Not implemented @@ -4078,7 +4078,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c std::cerr << " and " << ggml_type_name(src1->type); } std::cerr << " to " << ggml_type_name(dst->type) << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } op_func(ctx, subctx, src0, src1, dst); @@ -4521,7 +4521,7 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0 } else if (type == GGML_TYPE_F16) { val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0)); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } fprintf(stderr, "% 7.2f ", val); } else { @@ -4555,7 +4555,7 @@ static void 
ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t p = ctx->device->pipeline_matmul_f16->a_s; shname = "F16_ALIGNED_S"; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } else if (shader_size == 1) { if (std::is_same() && std::is_same()) { @@ -4571,7 +4571,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t p = ctx->device->pipeline_matmul_f16->a_m; shname = "F16_ALIGNED_M"; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } else if (shader_size == 2) { if (std::is_same() && std::is_same()) { @@ -4587,7 +4587,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t p = ctx->device->pipeline_matmul_f16->a_l; shname = "F16_ALIGNED_L"; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } else { GGML_ASSERT(0); @@ -4668,7 +4668,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t } else if (std::is_same()) { x[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } for (size_t i = 0; i < y_ne; i++) { @@ -4679,7 +4679,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t // y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f); y[i] = ggml_fp32_to_fp16((i % k == i / k) ? 
1.0f : 0.0f); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -4727,14 +4727,14 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t } else if (std::is_same()) { src0_type = GGML_TYPE_F16; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (std::is_same()) { src1_type = GGML_TYPE_F32; } else if (std::is_same()) { src1_type = GGML_TYPE_F16; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, src0_type, k, m, batch); @@ -4841,7 +4841,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1 } else if (tensor->type == GGML_TYPE_F16) { val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0])); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } fprintf(stderr, "% 7.2f ", val); } else { @@ -5391,7 +5391,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { std::cerr << std::endl; } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); #endif if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) { @@ -5486,7 +5486,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod break; default: std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); return; } @@ -6498,7 +6498,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d } else if (tensor->type == GGML_TYPE_I32) { val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } fprintf(stderr, "% 7.2f ", val); } else { @@ -6620,7 +6620,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * 
memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS); } } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { @@ -6662,7 +6662,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS); } } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { @@ -6720,7 +6720,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS); } } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { @@ -6797,7 +6797,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * break; default: std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) { if (src1 == nullptr) { @@ -6825,7 +6825,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * tensor_clone = ggml_sum_rows(ggml_ctx, src0_clone); } else { std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx); @@ -6912,7 +6912,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * } } else { std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl; - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) { @@ -6935,7 +6935,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * std::cerr << std::endl; 
std::vector done; ggml_vk_print_graph_origin(tensor, done); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } if (first_error[0] == -1 && std::fabs(correct - result) > 0.1f) { first_error[0] = i0; @@ -7006,7 +7006,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * std::cerr << std::endl; std::vector done; ggml_vk_print_graph_origin(tensor, done); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } else { std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl; } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 29afcc7f8..c196fd5bf 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -141,23 +141,25 @@ typedef pthread_t ggml_thread_t; #include -void ggml_print_backtrace(void) { - /* - #include - #include - +#if defined(__linux__) +#include +static void ggml_print_backtrace_symbols(void) { void * trace[100]; - int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0])); - backtrace_symbols_fd(trace, nptrs, STDERR_FILENO); - */ +} +#else +static void ggml_print_backtrace_symbols(void) { + // platform not supported +} +#endif - // backtrack_symbols does not show line numbers, use gdb instead +static void ggml_print_backtrace(void) { char attach[32]; snprintf(attach, sizeof(attach), "attach %d", getpid()); int pid = fork(); if (pid == 0) { + // try gdb execlp("gdb", "gdb", "--batch", "-ex", "set style enabled on", "-ex", attach, @@ -165,16 +167,46 @@ void ggml_print_backtrace(void) { "-ex", "detach", "-ex", "quit", (char *) NULL); + // try lldb + execlp("lldb", "lldb", "--batch", + "-o", "bt", + "-o", "quit", + "-p", attach, + (char *) NULL); + exit(EXIT_FAILURE); } else { - waitpid(pid, NULL, 0); + int wstatus; + waitpid(pid, &wstatus, 0); + if (WIFEXITED(wstatus)) { + if (WEXITSTATUS(wstatus) == EXIT_FAILURE) { + // gdb failed, fallback to backtrace_symbols + ggml_print_backtrace_symbols(); + } + } } } #else -void ggml_print_backtrace(void) { +static 
void ggml_print_backtrace(void) { // platform not supported } #endif +void ggml_abort(const char * file, int line, const char * fmt, ...) { + fflush(stdout); + + fprintf(stderr, "%s:%d: ", file, line); + + va_list args; + va_start(args, fmt); + vfprintf(stderr, fmt, args); + va_end(args); + + fprintf(stderr, "\n"); + + ggml_print_backtrace(); + abort(); +} + #define GGML_DEBUG 0 #define GGML_GELU_FP16 #define GGML_GELU_QUICK_FP16 @@ -246,7 +278,7 @@ inline static void * ggml_aligned_malloc(size_t size) { break; } GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); return NULL; } return aligned_memory; @@ -267,7 +299,7 @@ inline static void * ggml_malloc(size_t size) { void * result = malloc(size); if (result == NULL) { GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } return result; } @@ -281,7 +313,7 @@ inline static void * ggml_calloc(size_t num, size_t size) { void * result = calloc(num, size); if (result == NULL) { GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0)); - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } return result; } @@ -3372,7 +3404,7 @@ static inline int ggml_up(int n, int m) { } // assert that pointer is aligned to GGML_MEM_ALIGN -#define ggml_assert_aligned(ptr) \ +#define GGML_ASSERT_ALIGNED(ptr) \ GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0) //////////////////////////////////////////////////////////////////////////////// @@ -3473,7 +3505,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { GGML_ASSERT(ctx->mem_buffer != NULL); - ggml_assert_aligned(ctx->mem_buffer); + GGML_ASSERT_ALIGNED(ctx->mem_buffer); GGML_PRINT_DEBUG("%s: context initialized\n", __func__); @@ -3605,7 +3637,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml .type = type, }; - 
ggml_assert_aligned(mem_buffer + obj_new->offs); + GGML_ASSERT_ALIGNED(mem_buffer + obj_new->offs); if (obj_cur != NULL) { obj_cur->next = obj_new; @@ -3706,7 +3738,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( #endif // TODO: this should not be needed as long as we don't rely on aligned SIMD loads - //ggml_assert_aligned(result->data); + //GGML_ASSERT_ALIGNED(result->data); for (int i = 0; i < n_dims; i++) { result->ne[i] = ne[i]; @@ -3879,8 +3911,8 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } return tensor; @@ -3938,8 +3970,8 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } return tensor; @@ -4008,11 +4040,9 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { } default: { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } - - return 0.0f; } void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { @@ -4055,8 +4085,8 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -4076,10 +4106,8 @@ int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i case GGML_TYPE_F32: return ((float *) data)[0]; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } - - return 0.0f; } void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value) { @@ -4111,8 +4139,8 @@ void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -4149,11 +4177,9 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { } default: { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } - - 
return 0.0f; } void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { @@ -4190,8 +4216,8 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -4211,10 +4237,8 @@ float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, case GGML_TYPE_F32: return ((float *) data)[0]; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } - - return 0.0f; } void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value) { @@ -4246,8 +4270,8 @@ void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -4270,8 +4294,11 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) { } struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) { - strncpy(tensor->name, name, sizeof(tensor->name) - 1); - tensor->name[sizeof(tensor->name) - 1] = '\0'; + size_t i; + for (i = 0; i < sizeof(tensor->name) - 1 && name[i] != '\0'; i++) { + tensor->name[i] = name[i]; + } + tensor->name[i] = '\0'; return tensor; } @@ -4842,7 +4869,7 @@ struct ggml_tensor * ggml_mean( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement is_node = true; } @@ -4865,7 +4892,7 @@ struct ggml_tensor * ggml_argmax( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); is_node = true; } @@ -5188,7 +5215,7 @@ static struct ggml_tensor * ggml_norm_impl( bool is_node = false; if (!inplace && (a->grad)) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -5291,7 +5318,7 @@ static struct ggml_tensor * ggml_group_norm_impl( bool is_node = false; if (!inplace && (a->grad)) { - GGML_ASSERT(false); // 
TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -5705,7 +5732,7 @@ struct ggml_tensor * ggml_reshape( if (b->grad) { // gradient propagation is not supported - //GGML_ASSERT(false); + //GGML_ABORT("fatal error"); } struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0); @@ -6488,7 +6515,7 @@ struct ggml_tensor * ggml_clamp( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -6564,7 +6591,7 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d( bool is_node = false; if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -6636,7 +6663,7 @@ struct ggml_tensor * ggml_im2col( bool is_node = false; if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -6722,7 +6749,7 @@ struct ggml_tensor * ggml_conv_transpose_2d_p0( bool is_node = false; if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -6763,7 +6790,7 @@ struct ggml_tensor * ggml_pool_1d( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -6801,7 +6828,7 @@ struct ggml_tensor * ggml_pool_2d( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -6834,7 +6861,7 @@ static struct ggml_tensor * ggml_upscale_impl( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -6884,7 +6911,7 @@ struct 
ggml_tensor * ggml_pad( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -6933,7 +6960,7 @@ struct ggml_tensor * ggml_timestep_embedding( bool is_node = false; if (timesteps->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -7059,7 +7086,7 @@ struct ggml_tensor * ggml_flash_attn_back( struct ggml_tensor * v, struct ggml_tensor * d, bool masked) { - GGML_ASSERT(false && "TODO: adapt to ggml_flash_attn_ext() changes"); + GGML_ABORT("TODO: adapt to ggml_flash_attn_ext() changes"); GGML_ASSERT(ggml_can_mul_mat(k, q)); // TODO: check if vT can be multiplied by (k*qT) @@ -7158,7 +7185,7 @@ struct ggml_tensor * ggml_ssm_conv( bool is_node = false; if (s->grad || x->grad || c->grad || sq->grad) { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement is_node = true; } @@ -7212,7 +7239,7 @@ struct ggml_tensor * ggml_ssm_scan( bool is_node = false; if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad || sq->grad) { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement is_node = true; } @@ -7244,7 +7271,7 @@ struct ggml_tensor * ggml_win_part( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -7282,7 +7309,7 @@ struct ggml_tensor * ggml_win_unpart( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -7312,7 +7339,7 @@ struct ggml_tensor * ggml_get_rel_pos( bool is_node = false; if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward + GGML_ABORT("fatal error"); // TODO: implement backward is_node = true; } @@ -8002,7 +8029,7 @@ static void 
ggml_compute_forward_dup_f16( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } else { //printf("%s: this is not optimal - fix me\n", __func__); @@ -8044,7 +8071,7 @@ static void ggml_compute_forward_dup_f16( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } return; @@ -8161,7 +8188,7 @@ static void ggml_compute_forward_dup_f16( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } @@ -8288,7 +8315,7 @@ static void ggml_compute_forward_dup_bf16( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } else { //printf("%s: this is not optimal - fix me\n", __func__); @@ -8348,7 +8375,7 @@ static void ggml_compute_forward_dup_bf16( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } return; @@ -8517,7 +8544,7 @@ static void ggml_compute_forward_dup_bf16( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } @@ -8603,7 +8630,7 @@ static void ggml_compute_forward_dup_f32( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } else { //printf("%s: this is not optimal - fix me\n", __func__); @@ -8663,7 +8690,7 @@ static void ggml_compute_forward_dup_f32( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } @@ -8834,7 +8861,7 @@ static void ggml_compute_forward_dup_f32( } } } else { - GGML_ASSERT(false); // TODO: implement + GGML_ABORT("fatal error"); // TODO: implement } } @@ -9012,8 +9039,8 @@ static void ggml_compute_forward_dup( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -9165,7 +9192,7 @@ static void ggml_compute_forward_add_f16_f32( } else { // src1 is not contiguous - GGML_ASSERT(false); + 
GGML_ABORT("fatal error"); } } @@ -9240,7 +9267,7 @@ static void ggml_compute_forward_add_bf16_f32( } else { // src1 is not contiguous - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -9292,7 +9319,7 @@ static void ggml_compute_forward_add_f16_f16( } else { // src1 is not contiguous - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -9344,7 +9371,7 @@ static void ggml_compute_forward_add_bf16_bf16( } else { // src1 is not contiguous - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -9438,7 +9465,7 @@ static void ggml_compute_forward_add( ggml_compute_forward_add_f32(params, dst); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } break; case GGML_TYPE_F16: @@ -9450,7 +9477,7 @@ static void ggml_compute_forward_add( ggml_compute_forward_add_f16_f32(params, dst); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } break; case GGML_TYPE_BF16: @@ -9462,7 +9489,7 @@ static void ggml_compute_forward_add( ggml_compute_forward_add_bf16_f32(params, dst); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } break; case GGML_TYPE_Q4_0: @@ -9492,8 +9519,8 @@ static void ggml_compute_forward_add( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -9827,7 +9854,7 @@ static void ggml_compute_forward_add1( ggml_compute_forward_add1_f16_f32(params, dst); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } break; case GGML_TYPE_BF16: @@ -9839,7 +9866,7 @@ static void ggml_compute_forward_add1( ggml_compute_forward_add1_bf16_f32(params, dst); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } break; case GGML_TYPE_Q4_0: @@ -9870,8 +9897,8 @@ static void ggml_compute_forward_add1( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -9995,8 +10022,8 @@ static void ggml_compute_forward_acc( case GGML_TYPE_Q4_0_8_8: default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10076,8 +10103,8 @@ 
static void ggml_compute_forward_sub( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10170,8 +10197,8 @@ static void ggml_compute_forward_mul( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10261,8 +10288,8 @@ static void ggml_compute_forward_div( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10306,8 +10333,8 @@ static void ggml_compute_forward_sqr( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10351,8 +10378,8 @@ static void ggml_compute_forward_sqrt( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10396,8 +10423,8 @@ static void ggml_compute_forward_log( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10525,8 +10552,8 @@ static void ggml_compute_forward_sum( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10578,8 +10605,8 @@ static void ggml_compute_forward_sum_rows( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10635,8 +10662,8 @@ static void ggml_compute_forward_mean( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10683,8 +10710,8 @@ static void ggml_compute_forward_argmax( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10801,8 +10828,8 @@ static void ggml_compute_forward_repeat( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10879,8 +10906,8 @@ static void ggml_compute_forward_repeat_back( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10948,8 +10975,8 @@ static void ggml_compute_forward_concat( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -10992,8 +11019,8 @@ 
static void ggml_compute_forward_abs( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11036,8 +11063,8 @@ static void ggml_compute_forward_sgn( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11080,8 +11107,8 @@ static void ggml_compute_forward_neg( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11124,8 +11151,8 @@ static void ggml_compute_forward_step( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11168,8 +11195,8 @@ static void ggml_compute_forward_tanh( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11212,8 +11239,8 @@ static void ggml_compute_forward_elu( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11256,8 +11283,8 @@ static void ggml_compute_forward_relu( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11300,8 +11327,8 @@ static void ggml_compute_forward_sigmoid( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11359,8 +11386,8 @@ static void ggml_compute_forward_gelu( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11418,8 +11445,8 @@ static void ggml_compute_forward_gelu_quick( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11477,8 +11504,8 @@ static void ggml_compute_forward_silu( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } // ggml_compute_forward_leaky_relu @@ -11526,8 +11553,8 @@ static void ggml_compute_forward_leaky_relu( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11589,8 +11616,8 @@ static void ggml_compute_forward_silu_back( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal 
error"); + } } } @@ -11631,8 +11658,8 @@ static void ggml_compute_forward_hardswish( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11673,8 +11700,8 @@ static void ggml_compute_forward_hardsigmoid( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11745,8 +11772,8 @@ static void ggml_compute_forward_norm( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11813,8 +11840,8 @@ static void ggml_compute_forward_rms_norm( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -11986,8 +12013,8 @@ static void ggml_compute_forward_rms_norm_back( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -12080,8 +12107,8 @@ static void ggml_compute_forward_group_norm( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -12839,17 +12866,17 @@ static void ggml_compute_forward_out_prod( } break; case GGML_TYPE_F16: { - GGML_ASSERT(false); // todo + GGML_ABORT("fatal error"); // todo // ggml_compute_forward_out_prod_f16_f32(params, dst); - } break; + } case GGML_TYPE_F32: { ggml_compute_forward_out_prod_f32(params, dst); } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -12908,8 +12935,8 @@ static void ggml_compute_forward_scale( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -13024,8 +13051,8 @@ static void ggml_compute_forward_set( case GGML_TYPE_Q4_0_8_8: default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -13302,8 +13329,8 @@ static void ggml_compute_forward_get_rows( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } //static bool first = true; @@ -13410,8 +13437,8 @@ static void ggml_compute_forward_get_rows_back( } break; default: { - GGML_ASSERT(false); - } break; + 
GGML_ABORT("fatal error"); + } } //static bool first = true; @@ -13488,8 +13515,8 @@ static void ggml_compute_forward_diag( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -13558,8 +13585,8 @@ static void ggml_compute_forward_diag_mask_inf( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -13576,8 +13603,8 @@ static void ggml_compute_forward_diag_mask_zero( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -13694,8 +13721,8 @@ static void ggml_compute_forward_soft_max( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -13790,8 +13817,8 @@ static void ggml_compute_forward_soft_max_back( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -13881,8 +13908,8 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_F64: case GGML_TYPE_COUNT: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -14211,8 +14238,8 @@ static void ggml_compute_forward_rope( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -14235,8 +14262,8 @@ static void ggml_compute_forward_rope_back( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -14435,8 +14462,8 @@ static void ggml_compute_forward_conv_transpose_1d( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -14607,8 +14634,8 @@ static void ggml_compute_forward_im2col( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -14740,20 +14767,20 @@ static void ggml_compute_forward_pool_1d_sk_p0( switch (op) { case GGML_OP_POOL_AVG: drow[i] = 0; break; case GGML_OP_POOL_MAX: drow[i] = -FLT_MAX; break; - case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } for (int ki = 0; ki < k; ++ki) { switch 
(op) { case GGML_OP_POOL_AVG: drow[i] += srow[j]; break; case GGML_OP_POOL_MAX: if (srow[j] > drow[i]) drow[i] = srow[j]; break; - case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } ++j; } switch (op) { case GGML_OP_POOL_AVG: drow[i] /= k; break; case GGML_OP_POOL_MAX: break; - case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } } @@ -14822,7 +14849,7 @@ static void ggml_compute_forward_pool_2d( switch (op) { case GGML_OP_POOL_AVG: *out = 0; break; case GGML_OP_POOL_MAX: *out = -FLT_MAX; break; - case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } const int ix = offset0 + ox * s0; @@ -14837,14 +14864,14 @@ static void ggml_compute_forward_pool_2d( switch (op) { case GGML_OP_POOL_AVG: *out += srow[j]; break; case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break; - case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } } } switch (op) { case GGML_OP_POOL_AVG: *out /= ka; break; case GGML_OP_POOL_MAX: break; - case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } } } @@ -14908,8 +14935,8 @@ static void ggml_compute_forward_upscale( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -14966,8 +14993,8 @@ static void ggml_compute_forward_pad( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -15007,8 +15034,8 @@ static void ggml_compute_forward_arange( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -15058,8 +15085,8 @@ static void ggml_compute_forward_timestep_embedding( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -15117,8 +15144,8 @@ static void ggml_compute_forward_argsort( } break; default: { - 
GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -15340,8 +15367,8 @@ static void ggml_compute_forward_flash_attn_ext( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -15676,8 +15703,8 @@ static void ggml_compute_forward_flash_attn_back( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -15798,8 +15825,8 @@ static void ggml_compute_forward_ssm_conv( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -15919,8 +15946,8 @@ static void ggml_compute_forward_ssm_scan( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -15982,8 +16009,8 @@ static void ggml_compute_forward_win_part( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16043,8 +16070,8 @@ static void ggml_compute_forward_win_unpart( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16111,8 +16138,8 @@ static void ggml_compute_forward_unary( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16158,8 +16185,8 @@ static void ggml_compute_forward_get_rel_pos( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16239,8 +16266,8 @@ static void ggml_compute_forward_add_rel_pos( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16285,8 +16312,8 @@ static void ggml_compute_forward_map_unary( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16334,8 +16361,8 @@ static void ggml_compute_forward_map_binary( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16533,8 +16560,8 @@ static void ggml_compute_forward_cross_entropy_loss( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16620,8 
+16647,8 @@ static void ggml_compute_forward_cross_entropy_loss_back( } break; default: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } @@ -16956,14 +16983,32 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_COUNT: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } } //////////////////////////////////////////////////////////////////////////////// -static size_t ggml_hash_size(size_t min_sz) { +struct ggml_hash_set ggml_hash_set_new(size_t size) { + size = ggml_hash_size(size); + struct ggml_hash_set result; + result.size = size; + result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size); + result.used = GGML_CALLOC(ggml_bitset_size(size), sizeof(ggml_bitset_t)); + return result; +} + +void ggml_hash_set_reset(struct ggml_hash_set * hash_set) { + memset(hash_set->used, 0, sizeof(ggml_bitset_t) * ggml_bitset_size(hash_set->size)); +} + +void ggml_hash_set_free(struct ggml_hash_set * hash_set) { + GGML_FREE(hash_set->used); + GGML_FREE(hash_set->keys); +} + +size_t ggml_hash_size(size_t min_sz) { // next primes after powers of two static const size_t primes[] = { 2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031, @@ -16974,7 +17019,7 @@ static size_t ggml_hash_size(size_t min_sz) { }; static const size_t n_primes = sizeof(primes)/sizeof(primes[0]); - // find the smallest prime that is larger or equal to min_sz + // find the smallest prime that is larger or equal than min_sz size_t l = 0; size_t r = n_primes; while (l < r) { @@ -16989,67 +17034,6 @@ static size_t ggml_hash_size(size_t min_sz) { return sz; } -static size_t ggml_hash(const void * p) { - return (size_t)p; -} - -size_t ggml_hash_find(const struct ggml_hash_set hash_set, struct ggml_tensor * key) { - size_t h = ggml_hash(key) % hash_set.size; - - // linear probing - size_t i = h; - while (hash_set.keys[i] != NULL && hash_set.keys[i] != key) { - i = (i + 1) % hash_set.size; - if (i == h) { - // visited all 
hash table entries -> not found - return GGML_HASHTABLE_FULL; - } - } - return i; -} - -bool ggml_hash_contains(struct ggml_hash_set hash_set, struct ggml_tensor * key) { - size_t i = ggml_hash_find(hash_set, key); - return i != GGML_HASHTABLE_FULL && hash_set.keys[i] == key; -} - -size_t ggml_hash_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) { - size_t i = ggml_hash_find(hash_set, key); - - GGML_ASSERT(i != GGML_HASHTABLE_FULL); - - if (hash_set.keys[i] == key) { - return GGML_HASHTABLE_ALREADY_EXISTS; - } - - // insert - GGML_ASSERT(hash_set.keys[i] == NULL); - hash_set.keys[i] = key; - return i; -} - -size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key) { - size_t i = ggml_hash_find(hash_set, key); - - GGML_ASSERT(i != GGML_HASHTABLE_FULL); - - hash_set.keys[i] = key; - return i; -} - -struct ggml_hash_set ggml_hash_set_new(size_t size) { - size = ggml_hash_size(size); - struct ggml_hash_set result; - result.size = size; - result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size); - memset(result.keys, 0, sizeof(struct ggml_tensor *) * size); - return result; -} - -static void ggml_hash_set_free(struct ggml_hash_set hash_set) { - GGML_FREE(hash_set.keys); -} - struct hash_map { struct ggml_hash_set set; struct ggml_tensor ** vals; @@ -17058,13 +17042,12 @@ struct hash_map { static struct hash_map * ggml_new_hash_map(size_t size) { struct hash_map * result = GGML_MALLOC(sizeof(struct hash_map)); result->set = ggml_hash_set_new(size); - result->vals = GGML_MALLOC(sizeof(struct ggml_tensor *) * result->set.size); - memset(result->vals, 0, sizeof(struct ggml_tensor *) * result->set.size); + result->vals = GGML_CALLOC(result->set.size, sizeof(struct ggml_tensor *)); return result; } static void ggml_hash_map_free(struct hash_map * map) { - ggml_hash_set_free(map->set); + ggml_hash_set_free(&map->set); GGML_FREE(map->vals); GGML_FREE(map); } @@ -17085,7 +17068,7 @@ static struct ggml_tensor * 
ggml_recompute_graph_node( return node; } - if (!ggml_hash_contains(graph->visited_hash_table, node)) { + if (!ggml_hash_contains(&graph->visited_hash_set, node)) { return node; } @@ -17100,8 +17083,8 @@ static struct ggml_tensor * ggml_recompute_graph_node( return node; } - size_t i = ggml_hash_find(replacements->set, node); - GGML_ASSERT(i != GGML_HASHTABLE_FULL); // assert that not full + size_t i = ggml_hash_find(&replacements->set, node); + GGML_ASSERT(i != GGML_HASHSET_FULL); // assert that not full if (replacements->set.keys[i] == node) { return replacements->vals[i]; } @@ -17159,8 +17142,8 @@ void ggml_build_backward_gradient_checkpointing( // insert checkpoints in replacements for (int i = 0; i < n_checkpoints; ++i) { - size_t k = ggml_hash_find(replacements->set, checkpoints[i]); - GGML_ASSERT(k != GGML_HASHTABLE_FULL); // assert that not full + size_t k = ggml_hash_find(&replacements->set, checkpoints[i]); + GGML_ASSERT(k != GGML_HASHSET_FULL); // assert that not full GGML_ASSERT(replacements->set.keys[k] == NULL); // assert that we don't overwrite replacements->set.keys[k] = checkpoints[i]; replacements->vals[k] = checkpoints[i]; @@ -17188,7 +17171,7 @@ void ggml_build_backward_gradient_checkpointing( // functions to change gradients considering the case that input a might be initial gradient with zero value -static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) { +static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) { if (ggml_hash_contains(zero_table, a)) { return b; } else { @@ -17196,7 +17179,7 @@ static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct gg } } -static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, 
struct ggml_hash_set zero_table) { +static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set * zero_table) { if (ggml_hash_contains(zero_table, a)) { struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f); return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false); @@ -17205,7 +17188,7 @@ static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct gg } } -static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) { +static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) { if (ggml_hash_contains(zero_table, a)) { return ggml_repeat(ctx, b, a); } else { @@ -17213,7 +17196,7 @@ static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct g } } -static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set zero_table) { +static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) { if (ggml_hash_contains(zero_table, a)) { return ggml_neg(ctx, b); } else { @@ -17221,7 +17204,7 @@ static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct gg } } -static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set zero_table) { +static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, struct ggml_hash_set * zero_table) { struct ggml_tensor * src0 = tensor->src[0]; struct ggml_tensor * src1 = tensor->src[1]; struct ggml_tensor * src2 = tensor->src[2]; @@ -17390,8 +17373,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct 
ggml_tensor case GGML_OP_MEAN: case GGML_OP_ARGMAX: { - GGML_ASSERT(false); // TODO: implement - } break; + GGML_ABORT("fatal error"); // TODO: implement + } case GGML_OP_REPEAT: { // necessary for llama @@ -17414,16 +17397,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_CONCAT: { - GGML_ASSERT(false); // TODO: implement - } break; + GGML_ABORT("fatal error"); // TODO: implement + } case GGML_OP_SILU_BACK: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_NORM: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_RMS_NORM: { // necessary for llama @@ -17439,12 +17422,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_RMS_NORM_BACK: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_GROUP_NORM: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_MUL_MAT: { // https://cs231n.github.io/optimization-2/#staged @@ -17505,12 +17488,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_MUL_MAT_ID: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_OUT_PROD: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_SCALE: { // necessary for llama @@ -17686,12 +17669,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_GET_ROWS_BACK: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_DIAG: { - GGML_ASSERT(false); 
// TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_DIAG_MASK_INF: { // necessary for llama @@ -17729,8 +17712,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_SOFT_MAX_BACK: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_ROPE: { // necessary for llama @@ -17805,52 +17788,52 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_CLAMP: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_CONV_TRANSPOSE_1D: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_IM2COL: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_CONV_TRANSPOSE_2D: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_POOL_1D: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_POOL_2D: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_UPSCALE: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_PAD: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_ARANGE: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_TIMESTEP_EMBEDDING: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case 
GGML_OP_ARGSORT: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_LEAKY_RELU: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_FLASH_ATTN_EXT: { struct ggml_tensor * flash_grad = NULL; @@ -17906,13 +17889,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_FLASH_ATTN_BACK: { - GGML_ASSERT(false); // not supported - } break; + GGML_ABORT("fatal error"); // not supported + } case GGML_OP_SSM_CONV: case GGML_OP_SSM_SCAN: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_WIN_PART: case GGML_OP_WIN_UNPART: case GGML_OP_UNARY: @@ -17950,12 +17933,12 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_UNARY_OP_TANH: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_UNARY_OP_ELU: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_UNARY_OP_RELU: { if (src0->grad) { @@ -17969,16 +17952,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_UNARY_OP_SIGMOID: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_UNARY_OP_GELU: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_UNARY_OP_GELU_QUICK: { - GGML_ASSERT(false); // TODO: not implemented - } break; + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_UNARY_OP_SILU: { // necessary for llama @@ -17990,7 +17973,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } } break; default: 
- GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } break; case GGML_OP_GET_REL_POS: @@ -18004,8 +17987,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_OP_MAP_CUSTOM2: case GGML_OP_MAP_CUSTOM3: { - GGML_ASSERT(false); // not supported - } break; + GGML_ABORT("fatal error"); // not supported + } case GGML_OP_CROSS_ENTROPY_LOSS: { if (src0->grad) { @@ -18020,16 +18003,16 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - GGML_ASSERT(false); // not supported - } break; + GGML_ABORT("fatal error"); // not supported + } case GGML_OP_NONE: { // nop } break; case GGML_OP_COUNT: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } for (int i = 0; i < GGML_MAX_SRC; ++i) { @@ -18049,7 +18032,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * } // check if already visited - if (ggml_hash_insert(cgraph->visited_hash_table, node) == GGML_HASHTABLE_ALREADY_EXISTS) { + if (ggml_hash_insert(&cgraph->visited_hash_set, node) == GGML_HASHSET_ALREADY_EXISTS) { return; } @@ -18131,7 +18114,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * struct ggml_hash_set zero_table = ggml_hash_set_new(gf->size); for (int i = 0; i < gf->n_nodes; i++) { if (gf->grads[i]) { - ggml_hash_insert(zero_table, gf->grads[i]); + ggml_hash_insert(&zero_table, gf->grads[i]); } } @@ -18141,7 +18124,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * // inplace operations to add gradients are not created by ggml_compute_backward // use allocator to automatically make inplace operations if (node->grad) { - ggml_compute_backward(ctx, node, zero_table); + ggml_compute_backward(ctx, node, &zero_table); } } @@ -18154,16 +18137,29 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * } } - ggml_hash_set_free(zero_table); + 
ggml_hash_set_free(&zero_table); +} + +static void * incr_ptr_aligned(void ** p, size_t size, size_t align) { + void * ptr = *p; + ptr = (void *) GGML_PAD((uintptr_t) ptr, align); + *p = (void *) ((char *) ptr + size); + return ptr; } static size_t ggml_graph_nbytes(size_t size, bool grads) { - size_t nbytes = sizeof(struct ggml_cgraph); - nbytes += size * sizeof(struct ggml_tensor *) * 2; // leafs + nodes + size_t hash_size = ggml_hash_size(size * 2); + void * p = 0; + incr_ptr_aligned(&p, sizeof(struct ggml_cgraph), 1); + incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // nodes + incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // leafs + incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // hash keys if (grads) { - nbytes += size * sizeof(struct ggml_tensor *); // grads + incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // grads } - nbytes += ggml_hash_size(size * 2) * sizeof(struct ggml_tensor *); // hash set + incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t)); + + size_t nbytes = (size_t) p; return nbytes; } @@ -18180,19 +18176,19 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_GRAPH, obj_size); struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs); - struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1); - + // the size of the hash table is doubled since it needs to hold both nodes and leafs size_t hash_size = ggml_hash_size(size * 2); - struct ggml_tensor ** nodes_ptr = data_start; - struct ggml_tensor ** leafs_ptr = nodes_ptr + size; - struct ggml_tensor ** hash_keys_ptr = leafs_ptr + size; - struct ggml_tensor ** grads_ptr = grads ? 
hash_keys_ptr + hash_size : NULL; + + void * p = cgraph + 1; + + struct ggml_tensor ** nodes_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); + struct ggml_tensor ** leafs_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); + struct ggml_tensor ** hash_keys_ptr = incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); + struct ggml_tensor ** grads_ptr = grads ? incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL; + ggml_bitset_t * hash_used = incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t)); // check that we allocated the correct amount of memory - assert(obj_size == (size_t) ( - (grads ? (char *)(grads_ptr + size) : (char *)(hash_keys_ptr + hash_size)) - (char *)cgraph)); - - memset(hash_keys_ptr, 0, hash_size * sizeof(struct ggml_tensor *)); + assert(obj_size == (size_t)((char *)p - (char *)cgraph)); *cgraph = (struct ggml_cgraph) { /*.size =*/ size, @@ -18201,10 +18197,12 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz /*.nodes =*/ nodes_ptr, /*.grads =*/ grads_ptr, /*.leafs =*/ leafs_ptr, - /*.hash_table =*/ { hash_size, hash_keys_ptr }, + /*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr }, /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, }; + ggml_hash_set_reset(&cgraph->visited_hash_set); + return cgraph; } @@ -18220,7 +18218,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) /*.nodes =*/ cgraph0->nodes + i0, /*.grads =*/ cgraph0->grads ? 
cgraph0->grads + i0 : NULL, /*.leafs =*/ NULL, - /*.hash_table =*/ { 0, NULL }, + /*.hash_table =*/ { 0, NULL, NULL }, /*.order =*/ cgraph0->order, }; @@ -18230,7 +18228,7 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) { GGML_ASSERT(dst->size >= src->n_leafs); GGML_ASSERT(dst->size >= src->n_nodes); - GGML_ASSERT(dst->visited_hash_table.size >= src->visited_hash_table.size); + GGML_ASSERT(dst->visited_hash_set.size >= src->visited_hash_set.size); dst->n_leafs = src->n_leafs; dst->n_nodes = src->n_nodes; @@ -18251,9 +18249,9 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) { } } - for (size_t i = 0; i < src->visited_hash_table.size; ++i) { - if (src->visited_hash_table.keys[i]) { - ggml_hash_insert(dst->visited_hash_table, src->visited_hash_table.keys[i]); + for (size_t i = 0; i < src->visited_hash_set.size; ++i) { + if (src->visited_hash_set.keys[i]) { + ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]); } } } @@ -18279,7 +18277,7 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) { void ggml_graph_clear(struct ggml_cgraph * cgraph) { cgraph->n_leafs = 0; cgraph->n_nodes = 0; - memset(cgraph->visited_hash_table.keys, 0, cgraph->visited_hash_table.size * sizeof(struct ggml_tensor *)); + ggml_hash_set_reset(&cgraph->visited_hash_set); } // @@ -18471,7 +18469,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { n_tasks = n_threads; } break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } break; case GGML_OP_SILU_BACK: @@ -18598,8 +18596,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; case GGML_OP_COUNT: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } default: { fprintf(stderr, "%s: op not implemented: ", __func__); @@ -18608,8 +18606,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } else 
{ fprintf(stderr, "%d\n", node->op); } - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } } assert(n_tasks > 0); @@ -18719,7 +18717,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa cur += sizeof(float)*ne00*ne01*ne02; cur += sizeof(float)*ne10*ne11; } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } break; case GGML_OP_CONV_TRANSPOSE_2D: @@ -18765,8 +18763,8 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa } break; case GGML_OP_COUNT: { - GGML_ASSERT(false); - } break; + GGML_ABORT("fatal error"); + } default: break; } @@ -20000,9 +19998,9 @@ static enum ggml_opt_result linesearch_backtracking( (*step) *= width; } - GGML_ASSERT(false && "line search failed"); + GGML_ABORT("line search failed"); - return GGML_LINESEARCH_FAIL; + //return GGML_LINESEARCH_FAIL; } static enum ggml_opt_result ggml_opt_lbfgs( @@ -20270,9 +20268,9 @@ static enum ggml_opt_result ggml_opt_lbfgs( step[0] = 1.0; } - GGML_ASSERT(false && "lbfgs failed"); + GGML_ABORT("lbfgs failed"); - return GGML_OPT_RESULT_DID_NOT_CONVERGE; + //return GGML_OPT_RESULT_DID_NOT_CONVERGE; } struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { @@ -20967,10 +20965,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p } } break; case GGUF_TYPE_ARRAY: - default: GGML_ASSERT(false && "invalid type"); break; + default: GGML_ABORT("invalid type"); } } break; - default: GGML_ASSERT(false && "invalid type"); + default: GGML_ABORT("invalid type"); } if (!ok) { @@ -21551,12 +21549,12 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) { gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n); GGML_FREE((void *)data); } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) { - GGML_ASSERT(false && "nested arrays not supported"); + GGML_ABORT("nested arrays not supported"); } else { gguf_set_arr_data(ctx, src->kv[i].key.data, 
src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n); } } break; - default: GGML_ASSERT(false && "invalid type"); break; + default: GGML_ABORT("invalid type"); } } } @@ -21565,7 +21563,7 @@ void gguf_add_tensor( struct gguf_context * ctx, const struct ggml_tensor * tensor) { if (gguf_find_tensor(ctx, tensor->name) != -1) { - GGML_ASSERT(false && "duplicated tensor name"); + GGML_ABORT("duplicated tensor name"); } const int idx = ctx->header.n_tensors; @@ -21598,7 +21596,7 @@ void gguf_add_tensor( void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) { const int idx = gguf_find_tensor(ctx, name); if (idx < 0) { - GGML_ASSERT(false && "tensor not found"); + GGML_ABORT("tensor not found"); } ctx->infos[idx].type = type; @@ -21607,7 +21605,7 @@ void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggm void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) { const int idx = gguf_find_tensor(ctx, name); if (idx < 0) { - GGML_ASSERT(false && "tensor not found"); + GGML_ABORT("tensor not found"); } ctx->infos[idx].data = data; @@ -21736,10 +21734,10 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * } } break; case GGUF_TYPE_ARRAY: - default: GGML_ASSERT(false && "invalid type"); break; + default: GGML_ABORT("invalid type"); } } break; - default: GGML_ASSERT(false && "invalid type"); + default: GGML_ABORT("invalid type"); } } @@ -21800,7 +21798,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) { FILE * file = ggml_fopen(fname, "wb"); if (!file) { - GGML_ASSERT(false && "failed to open file for writing"); + GGML_ABORT("failed to open file for writing"); } struct gguf_buf buf = gguf_buf_init(16*1024); diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp index 
bd9322e2f..b123d7331 100644 --- a/src/llama-grammar.cpp +++ b/src/llama-grammar.cpp @@ -221,7 +221,7 @@ static void llama_grammar_advance_stack( // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on // those - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -517,7 +517,7 @@ void llama_grammar_accept_token_impl(struct llama_grammar * grammar, const struc return; } } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } const std::string & piece = vocab->cache_token_to_piece.at(token); diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index c482b3689..133094904 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -152,14 +152,14 @@ static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) { return strtol(buf.c_str(), NULL, 16); } case LLAMA_VOCAB_TYPE_BPE: { - GGML_ASSERT(false); - return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT? + GGML_ABORT("fatal error"); + //return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT? 
} case LLAMA_VOCAB_TYPE_WPM: { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -1396,7 +1396,7 @@ std::vector llama_tokenize_internal(const llama_vocab & vocab, } } break; case LLAMA_VOCAB_TYPE_NONE: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } return output; @@ -1422,7 +1422,7 @@ llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch) { return vocab.token_to_id.at(unicode_byte_to_utf8(ch)); } default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -1606,7 +1606,7 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token break; } default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } diff --git a/src/llama.cpp b/src/llama.cpp index 77f7d32f8..bc830c0ef 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2259,8 +2259,7 @@ struct llama_hparams { return n_head_arr[il]; } - GGML_ASSERT(false); - return 0; + GGML_ABORT("fatal error"); } uint32_t n_head_kv(uint32_t il = 0) const { @@ -2268,8 +2267,7 @@ struct llama_hparams { return n_head_kv_arr[il]; } - GGML_ASSERT(false); - return 0; + GGML_ABORT("fatal error"); } uint32_t n_ff(uint32_t il = 0) const { @@ -2277,8 +2275,7 @@ struct llama_hparams { return n_ff_arr[il]; } - GGML_ASSERT(false); - return 0; + GGML_ABORT("fatal error"); } uint32_t n_gqa(uint32_t il = 0) const { @@ -8072,7 +8069,7 @@ static struct ggml_tensor * llm_build_moe_ffn( cb(gate, "ffn_moe_gelu", il); } break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens] @@ -8635,8 +8632,8 @@ struct llm_build_context { } break; default: { - GGML_ASSERT(false && "unknown pooling type"); - } break; + GGML_ABORT("unknown pooling type"); + } } cb(cur, "result_embd_pooled", -1); @@ -8891,7 +8888,7 @@ struct llm_build_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens); break; default: - GGML_ASSERT(false); + 
GGML_ABORT("fatal error"); } cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); @@ -11723,7 +11720,7 @@ struct llm_build_context { switch (model.type) { case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break; case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break; - default: GGML_ASSERT(false); + default: GGML_ABORT("fatal error"); }; cb(Qcur, "Qcur_scaled", il); @@ -13888,7 +13885,7 @@ static struct ggml_cgraph * llama_build_graph( result = llm.build_jais(); } break; default: - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } // add on pooling layer @@ -14687,8 +14684,8 @@ static int llama_decode_internal( } break; case LLAMA_POOLING_TYPE_UNSPECIFIED: { - GGML_ASSERT(false && "unknown pooling type"); - } break; + GGML_ABORT("unknown pooling type"); + } } } n_outputs_prev += lctx.n_outputs; @@ -15079,7 +15076,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { // apply K-shift if needed if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA - GGML_ASSERT(false && "Deepseek2 does not support K-shift"); + GGML_ABORT("Deepseek2 does not support K-shift"); } { @@ -15218,7 +15215,7 @@ static void llama_tensor_dequantize_internal( } else if (ggml_is_quantized(tensor->type)) { qtype.to_float(tensor->data, f32_output, nelements); } else { - GGML_ASSERT(false); // unreachable + GGML_ABORT("fatal error"); // unreachable } return; } @@ -16904,8 +16901,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { // all model arches should be listed explicitly here case LLM_ARCH_UNKNOWN: - GGML_ASSERT(false && "unknown architecture"); - break; + GGML_ABORT("unknown architecture"); } return LLAMA_ROPE_TYPE_NONE; @@ -18469,7 +18465,7 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { } catch (const std::exception & err) { 
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); #ifndef NDEBUG - GGML_ASSERT(false); + GGML_ABORT("fatal error"); #endif return nullptr; } @@ -18514,7 +18510,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) { } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what()); #ifndef NDEBUG - GGML_ASSERT(false); + GGML_ABORT("fatal error"); #endif return nullptr; } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 2c03c60d4..2fa59fd0a 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -94,7 +94,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m // This is going to create some weird integers though. ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } @@ -132,7 +132,7 @@ static std::vector tensor_to_float(const ggml_tensor * t) { tt.to_float(&buf[i], vq.data(), bs); tv.insert(tv.end(), vq.begin(), vq.end()); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } } @@ -1435,7 +1435,7 @@ struct test_argsort : public test_case { ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float)); } } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } } @@ -2462,7 +2462,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op return true; } - GGML_ASSERT(false); + GGML_ABORT("fatal error"); return false; } diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 6374958fe..de858bd3b 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -166,12 +166,12 @@ static void test_sampler_queue( for (auto s : samplers_sequence) { switch (s){ case 'k': llama_sample_top_k (nullptr, &candidates_p, top_k, 1); break; - case 'f': GGML_ASSERT(false && "tail_free test not implemented"); break; - case 'y': 
GGML_ASSERT(false && "typical test not implemented"); break; + case 'f': GGML_ABORT("tail_free test not implemented"); break; + case 'y': GGML_ABORT("typical test not implemented"); break; case 'p': llama_sample_top_p (nullptr, &candidates_p, top_p, 1); break; case 'm': llama_sample_min_p (nullptr, &candidates_p, min_p, 1); break; - case 't': GGML_ASSERT(false && "temperature test not implemented"); break; - default : GGML_ASSERT(false && "Unknown sampler"); break; + case 't': GGML_ABORT("temperature test not implemented"); break; + default : GGML_ABORT("Unknown sampler"); break; } llama_sample_softmax(nullptr, &candidates_p); // make sure tokens are sorted for tests @@ -222,7 +222,7 @@ static void test_sampler_queue( GGML_ASSERT(candidates_p.data[0].id == max_token_id); GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id); } else { - GGML_ASSERT(false); + GGML_ABORT("fatal error"); } } From bfb4c74981f0a40d757b450b596a9fe4ca983d26 Mon Sep 17 00:00:00 2001 From: wangshuai09 <391746016@qq.com> Date: Sat, 27 Jul 2024 16:36:44 +0800 Subject: [PATCH 09/21] cann: Fix Multi-NPU execution error (#8710) * cann: fix multi-npu exec error * cann: update comment for ggml_backend_cann_supports_buft --- ggml/src/ggml-cann.cpp | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp index ad5feea05..461febcc0 100644 --- a/ggml/src/ggml-cann.cpp +++ b/ggml/src/ggml-cann.cpp @@ -1559,23 +1559,18 @@ GGML_CALL static bool ggml_backend_cann_cpy_tensor_async( return false; } + // need open both directions for memcpyasync between devices. 
+ ggml_cann_set_device(cann_ctx_dst->device); + ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0)); ggml_cann_set_device(cann_ctx_src->device); ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0)); + ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, ACL_MEMCPY_DEVICE_TO_DEVICE, - cann_ctx_dst->stream())); + cann_ctx_src->stream())); - // record event on src stream - if (!cann_ctx_src->copy_event) { - ACL_CHECK(aclrtCreateEvent(&cann_ctx_src->copy_event)); - } - - ACL_CHECK( - aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream())); - - // wait on dst stream for the copy to complete - ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(), - cann_ctx_src->copy_event)); + //TODO: workaround for Event didn`t work here. + aclrtSynchronizeStream(cann_ctx_src->stream()); } else { // src and dst are on the same backend ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, @@ -1763,8 +1758,8 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) { * * This function determines whether the CANN backend supports the given backend * buffer type by comparing the device context of the backend and buffer type. - * It returns true if the device associated with the buffer type matches the - * device associated with the backend. + * It returns true if the devices are same between the backend context and + * buffer type context. * * @param backend Pointer to the CANN backend. * @param buft Pointer to the backend buffer type to check. 
@@ -1773,9 +1768,14 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) { */ GGML_CALL static bool ggml_backend_cann_supports_buft( ggml_backend_t backend, ggml_backend_buffer_type_t buft) { - return buft->iface.get_name == ggml_backend_cann_buffer_type_name; - - GGML_UNUSED(backend); + if (ggml_backend_buft_is_cann(buft)) { + ggml_backend_cann_context * cann_ctx = + (ggml_backend_cann_context *)backend->context; + ggml_backend_cann_buffer_type_context * buft_ctx = + (ggml_backend_cann_buffer_type_context *)buft->context; + return buft_ctx->device == cann_ctx->device; + } + return false; } /** From 9d03d085dd6cb275c078690bb64073b9b043e95f Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Sat, 27 Jul 2024 12:45:02 +0200 Subject: [PATCH 10/21] common : add --no-warmup option for main/llama-cli (#8712) This commit adds a --no-warmup option for llama-cli. The motivation for this is that it can be convenient to skip the warmup llama_decode call when debugging. Signed-off-by: Daniel Bevenius --- common/common.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/common/common.cpp b/common/common.cpp index ec44a0552..60c7eac75 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1324,6 +1324,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa else { invalid_param = true; } return true; } + if (arg == "--no-warmup") { + params.warmup = false; + return true; + } #ifndef LOG_DISABLE_LOGS // Parse args for logging parameters if (log_param_single_parse(argv[i])) { @@ -1446,6 +1450,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" }); options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" }); options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: 
empty)" }); + options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" }); options.push_back({ "server infill", " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" }); From 92090eca212650727e38b335c1d4accfbcc9b79c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 27 Jul 2024 14:59:29 +0300 Subject: [PATCH 11/21] llama : add function for model-based max number of graph nodes (#8622) * llama : model-based max number of graph nodes ggml-ci * llama : disable 405B max_nodes path due to lack of complaints ggml-ci --- src/llama.cpp | 96 ++++++++++++++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 43 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index bc830c0ef..c9cdbb343 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -101,7 +101,6 @@ #endif // bump if necessary -#define LLAMA_MAX_NODES 8192 #define LLAMA_MAX_LAYERS 512 #define LLAMA_MAX_EXPERTS 160 // DeepSeekV2 @@ -3567,6 +3566,15 @@ namespace GGUFMeta { using llama_buf_map = std::unordered_map; +// TODO: update when needed or think of some clever automatic way to do this +static size_t llama_model_max_nodes(const llama_model & /*model*/) { + //if (model.arch == LLM_ARCH_LLAMA && model.hparams.n_layer > ??) 
{ // llama-3 405B + // return 32768; + //} + + return 8192; +} + struct llama_model_loader { int n_kv = 0; int n_tensors = 0; @@ -8396,7 +8404,7 @@ struct llm_build_context { } struct ggml_cgraph * build_k_shift() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); GGML_ASSERT(kv_self.size == n_ctx); @@ -8427,7 +8435,7 @@ struct llm_build_context { } struct ggml_cgraph * build_s_copy() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); GGML_ASSERT(kv_self.recurrent); @@ -8450,7 +8458,7 @@ struct llm_build_context { } struct ggml_cgraph * build_defrag(const std::vector & ids) { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); for (uint32_t i = 0; i < ids.size(); ++i) { const uint32_t id = ids[i]; @@ -8691,7 +8699,7 @@ struct llm_build_context { } struct ggml_cgraph * build_llama() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -8834,7 +8842,7 @@ struct llm_build_context { } struct ggml_cgraph * build_baichuan() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -8949,7 +8957,7 @@ struct llm_build_context { } struct ggml_cgraph * build_xverse() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, 
LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -9052,7 +9060,7 @@ struct llm_build_context { } struct ggml_cgraph * build_falcon() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -9172,7 +9180,7 @@ struct llm_build_context { } struct ggml_cgraph * build_grok() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -9329,7 +9337,7 @@ struct llm_build_context { } struct ggml_cgraph * build_dbrx() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -9455,7 +9463,7 @@ struct llm_build_context { } struct ggml_cgraph * build_starcoder() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -9559,7 +9567,7 @@ struct llm_build_context { } struct ggml_cgraph * build_refact() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); 
const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -9653,7 +9661,7 @@ struct llm_build_context { } struct ggml_cgraph * build_bert() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -9847,7 +9855,7 @@ struct llm_build_context { } struct ggml_cgraph * build_bloom() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -9948,7 +9956,7 @@ struct llm_build_context { } struct ggml_cgraph * build_mpt() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -10238,7 +10246,7 @@ struct llm_build_context { } struct ggml_cgraph * build_qwen() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -10350,7 +10358,7 @@ struct llm_build_context { } struct ggml_cgraph * build_qwen2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -10462,7 +10470,7 @@ struct llm_build_context { } struct ggml_cgraph 
* build_qwen2moe() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -10608,7 +10616,7 @@ struct llm_build_context { } struct ggml_cgraph * build_phi2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -10729,7 +10737,7 @@ struct llm_build_context { } struct ggml_cgraph * build_phi3() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -10961,7 +10969,7 @@ struct llm_build_context { } struct ggml_cgraph * build_gpt2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -11066,7 +11074,7 @@ struct llm_build_context { } struct ggml_cgraph * build_codeshell() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -11177,7 +11185,7 @@ struct llm_build_context { } struct ggml_cgraph * build_orion() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = 
ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11295,7 +11303,7 @@ struct llm_build_context { } struct ggml_cgraph * build_internlm2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11416,7 +11424,7 @@ struct llm_build_context { // https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738 // based on the original build_llama() function struct ggml_cgraph * build_minicpm() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11560,7 +11568,7 @@ struct llm_build_context { } struct ggml_cgraph * build_gemma() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head_k = hparams.n_embd_head_k; @@ -11668,7 +11676,7 @@ struct llm_build_context { } struct ggml_cgraph * build_gemma2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head_k = hparams.n_embd_head_k; @@ -11803,7 +11811,7 @@ struct llm_build_context { struct ggml_cgraph * build_starcoder2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; 
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -11922,7 +11930,7 @@ struct llm_build_context { } struct ggml_cgraph * build_mamba() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t d_model = n_embd; const int64_t d_conv = hparams.ssm_d_conv; @@ -12071,7 +12079,7 @@ struct llm_build_context { struct ggml_cgraph * build_command_r() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -12225,7 +12233,7 @@ struct llm_build_context { // * removed bias // * removed MoE struct ggml_cgraph * build_olmo() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -12349,7 +12357,7 @@ struct llm_build_context { } struct ggml_cgraph * build_openelm() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -12474,7 +12482,7 @@ struct llm_build_context { } struct ggml_cgraph * build_gptneox() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -12616,7 +12624,7 @@ struct llm_build_context { } struct ggml_cgraph * 
build_arctic() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -12748,7 +12756,7 @@ struct llm_build_context { } struct ggml_cgraph * build_deepseek2() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -12976,7 +12984,7 @@ struct llm_build_context { } struct ggml_cgraph * build_bitnet() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -13116,7 +13124,7 @@ struct llm_build_context { } struct ggml_cgraph * build_t5() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens int32_t n_tokens = this->n_tokens; @@ -13433,7 +13441,7 @@ struct llm_build_context { } struct ggml_cgraph * build_jais() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -13525,7 +13533,7 @@ struct llm_build_context { } struct ggml_cgraph * build_chatglm() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, 
false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -14870,9 +14878,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { // each move requires 6*n_layer tensors (see build_defrag) // - source view, destination view, copy operation // - x2 for keys and values - //const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer); + //const uint32_t max_moves = llama_model_max_nodes(model)/(6*n_layer); // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 - const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer); + const uint32_t max_moves = (llama_model_max_nodes(lctx.model) - 2*n_layer)/(6*n_layer); // determine which KV cells to move where // @@ -16762,8 +16770,10 @@ struct llama_context * llama_new_context_with_model( } } + const size_t max_nodes = llama_model_max_nodes(*model); + // buffer used to store the computation graph and the tensor meta data - ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false)); + ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false)); // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary bool pipeline_parallel = @@ -16776,7 +16786,7 @@ struct llama_context * llama_new_context_with_model( // currently this is only implemented in the CUDA backend pipeline_parallel = false; #endif - ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES, pipeline_parallel); + ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel); if (pipeline_parallel) { LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", 
__func__, ggml_backend_sched_get_n_copies(ctx->sched)); From b5e95468b1676e1e5c9d80d1eeeb26f542a38f42 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sat, 27 Jul 2024 05:03:45 -0700 Subject: [PATCH 12/21] llama : add support for llama 3.1 rope scaling factors (#8676) * Add llama 3.1 rope scaling factors to llama conversion and inference This commit generates the rope factors on conversion and adds them to the resulting model as a tensor. At inference time, these factors are passed to the `ggml_rope_ext` rope operation, improving results for context windows above 8192 * Update convert_hf_to_gguf.py Co-authored-by: compilade * address comments * address comments * Update src/llama.cpp Co-authored-by: compilade * Update convert_hf_to_gguf.py Co-authored-by: compilade --------- Co-authored-by: compilade --- convert_hf_to_gguf.py | 28 ++++++++++++++++++++++++++++ src/llama.cpp | 14 ++++++++++++-- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 4087187c1..8ba3c5844 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1570,6 +1570,34 @@ class LlamaModel(Model): return [(self.map_tensor_name(name), data_torch)] def prepare_tensors(self): + if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): + if rope_scaling.get("rope_type", '').lower() == "llama3": + base = self.hparams.get("rope_theta", 10000.0) + dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + + factor = rope_scaling.get("factor", 8.0) + low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) + high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) + old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + assert low_freq_wavelen != high_freq_wavelen + + rope_factors = [] 
+ for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + rope_factors.append(1) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + rope_factors.append(1 / ((1 - smooth) / factor + smooth)) + + self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32)) + super().prepare_tensors() if self._experts is not None: diff --git a/src/llama.cpp b/src/llama.cpp index c9cdbb343..0345d0062 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2451,6 +2451,7 @@ struct llama_layer { // long rope factors struct ggml_tensor * rope_long = nullptr; struct ggml_tensor * rope_short = nullptr; + struct ggml_tensor * rope_freqs = nullptr; // bitnet scale struct ggml_tensor * wq_scale; @@ -6059,6 +6060,8 @@ static bool llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? 
llama_model_loader::TENSOR_DUPLICATED : 0)); + if (n_expert == 0) { layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); @@ -8536,6 +8539,10 @@ struct llm_build_context { // choose long/short freq factors based on the context size const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max; + if (model.layers[il].rope_freqs != nullptr) { + return model.layers[il].rope_freqs; + } + if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) { return model.layers[il].rope_long; } @@ -8730,6 +8737,9 @@ struct llm_build_context { // self-attention { + // rope freq factors for llama3; may return nullptr for llama2 and other models + struct ggml_tensor * rope_factors = build_rope_factors(il); + // compute Q and K and RoPE them struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); @@ -8753,14 +8763,14 @@ struct llm_build_context { } Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); From c12b6e8ee7d905e0f299caf311689189fb1b4ac5 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Mon, 8 Jul 2024 12:03:42 +0200 Subject: [PATCH 13/21] ggml : remove unnecessary UNUSED macro call (ggml/880) This commit removes an UNUSED macro call that is not needed as the variable n0 is used in the code and will not produce a warning. 
Signed-off-by: Daniel Bevenius --- ggml/src/ggml.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index c196fd5bf..a14d0d1db 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -18078,7 +18078,6 @@ static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_ten } const int n0 = cgraph->n_nodes; - UNUSED(n0); ggml_visit_parents(cgraph, tensor); From d2b851bfa131478665315bc5c7c707506c14d703 Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Fri, 12 Jul 2024 17:24:20 +0300 Subject: [PATCH 14/21] cmake : only enable GGML_NATIVE and x86 flags if not crosscompiling (ggml/885) --- ggml/CMakeLists.txt | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index be22a7460..1768a508b 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -50,9 +50,15 @@ else() set(GGML_BLAS_VENDOR_DEFAULT "Generic") endif() +if (CMAKE_CROSSCOMPILING) + set(GGML_NATIVE_DEFAULT OFF) +else() + set(GGML_NATIVE_DEFAULT ON) +endif() + # general option(GGML_STATIC "ggml: static link libraries" OFF) -option(GGML_NATIVE "ggml: enable -march=native flag" ON) +option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT}) option(GGML_LTO "ggml: enable link time optimization" OFF) option(GGML_CCACHE "ggml: use ccache if available" ON) @@ -70,7 +76,7 @@ option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF) option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF) # instruction set specific -if (GGML_NATIVE) +if (GGML_NATIVE OR NOT GGML_NATIVE_DEFAULT) set(INS_ENB OFF) else() set(INS_ENB ON) From 203b7f1531303a060730ec1d1e01920e70302398 Mon Sep 17 00:00:00 2001 From: Tony Wasserka <4840017+neobrain@users.noreply.github.com> Date: Sat, 20 Jul 2024 20:49:44 +0200 Subject: [PATCH 15/21] vulkan : initialize vk_buffer_struct members to VK_NULL_HANDLE (ggml/893) This prevents invalid frees when destroying a partially initialized 
vk_buffer_struct. For example, this could happen in ggml_vk_create_buffer when running out of device memory. Co-authored-by: Tony Wasserka --- ggml/src/ggml-vulkan.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 74991f6d1..fa68360b9 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -236,8 +236,8 @@ struct vk_device_struct { }; struct vk_buffer_struct { - vk::Buffer buffer; - vk::DeviceMemory device_memory; + vk::Buffer buffer = VK_NULL_HANDLE; + vk::DeviceMemory device_memory = VK_NULL_HANDLE; vk::MemoryPropertyFlags memory_property_flags; void * ptr; size_t size = 0; From 9f77d899b7b0d56496f679e54b797da6199fed8e Mon Sep 17 00:00:00 2001 From: Ivan Filipov <159561759+vanaka11@users.noreply.github.com> Date: Mon, 22 Jul 2024 14:32:02 +0300 Subject: [PATCH 16/21] ggml: add support for float16 input tensors in pooling operations (ggml/895) * Add support for float16 tensors in 1d pooling operations * Add support for float16 input tensors in 2d pooling operations * code cleanup remove unnecessary casting during srow ptr initialization --------- Co-authored-by: vanaka11 --- ggml/src/ggml.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index a14d0d1db..c76d00a39 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -14746,7 +14746,7 @@ static void ggml_compute_forward_pool_1d_sk_p0( const struct ggml_tensor * src = dst->src[0]; - assert(src->type == GGML_TYPE_F32); + assert(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16); if (params->ith != 0) { return; @@ -14759,10 +14759,8 @@ static void ggml_compute_forward_pool_1d_sk_p0( const int64_t rs = dst->ne[0]; while (cdata < data_end) { - const float * const srow = (const float *)cdata; - + const void * srow = (const void *)cdata; int j = 0; - for (int64_t i = 0; i < rs; ++i) { switch (op) { case GGML_OP_POOL_AVG: drow[i] = 0; break; 
@@ -14770,10 +14768,11 @@ static void ggml_compute_forward_pool_1d_sk_p0( case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } for (int ki = 0; ki < k; ++ki) { + const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); switch (op) { - case GGML_OP_POOL_AVG: drow[i] += srow[j]; break; - case GGML_OP_POOL_MAX: if (srow[j] > drow[i]) drow[i] = srow[j]; break; - case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); + case GGML_OP_POOL_AVG: drow[i] += srow_j; break; + case GGML_OP_POOL_MAX: if (srow_j > drow[i]) drow[i] = srow_j; break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } ++j; } @@ -14814,7 +14813,7 @@ static void ggml_compute_forward_pool_2d( const struct ggml_tensor * src = dst->src[0]; - GGML_ASSERT(src->type == GGML_TYPE_F32); + assert(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16); if (params->ith != 0) { return; @@ -14857,14 +14856,15 @@ static void ggml_compute_forward_pool_2d( for (int ky = 0; ky < k1; ++ky) { if (iy + ky < 0 || iy + ky >= src->ne[1]) continue; - const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky)); + const void * srow = (const void *)(cdata + src->nb[1] * (iy + ky)); for (int kx = 0; kx < k0; ++kx) { int j = ix + kx; if (j < 0 || j >= src->ne[0]) continue; + const float srow_j = (src->type == GGML_TYPE_F32) ? 
((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); switch (op) { - case GGML_OP_POOL_AVG: *out += srow[j]; break; - case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break; - case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); + case GGML_OP_POOL_AVG: *out += srow_j; break; + case GGML_OP_POOL_MAX: if (srow_j > *out) *out = srow_j; break; + case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } } } From a05ca9369716a8319014cd1fc365980d43f8aae9 Mon Sep 17 00:00:00 2001 From: Mahesh Madhav <67384846+heshpdx@users.noreply.github.com> Date: Thu, 25 Jul 2024 00:54:08 -0700 Subject: [PATCH 17/21] ggml : loop tiling optimizations for scalar path (ggml/898) Apply a loop tiling technique to the generic path, which provides performance upside for ISAs with enough registers to take advantage of it. Also helps the compiler optimize this path. --- ggml/src/ggml-quants.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 1c6c85aac..aa936fe5f 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -4190,15 +4190,18 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); #endif for (; ib < nb; ++ib) { - int sumi = 0; + int sumi0 = 0; + int sumi1 = 0; for (int j = 0; j < qk/2; ++j) { const int v0 = (x[ib].qs[j] & 0x0F) - 8; const int v1 = (x[ib].qs[j] >> 4) - 8; - sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]); + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); } + int sumi = sumi0 + sumi1; sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); } @@ -4474,15 +4477,18 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r sumf = hsum_float_8(acc) + summs; #endif for (; ib < nb; ++ib) { - int sumi = 0; + int sumi0 = 0 + int sumi1 = 0; for (int j = 0; j < qk/2; ++j) { const int v0 = 
(x[ib].qs[j] & 0x0F); const int v1 = (x[ib].qs[j] >> 4); - sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]); + sumi0 += (v0 * y[ib].qs[j]); + sumi1 += (v1 * y[ib].qs[j + qk/2]); } + int sumi = sumi0 + sumi1; sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); } @@ -4823,18 +4829,21 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r uint32_t qh; memcpy(&qh, x[ib].qh, sizeof(qh)); - int sumi = 0; + int sumi0 = 0; + int sumi1 = 0; for (int j = 0; j < qk/2; ++j) { const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); - const int32_t x0 = ((x[ib].qs[j] & 0x0F) | xh_0) - 16; - const int32_t x1 = ((x[ib].qs[j] >> 4) | xh_1) - 16; + const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16); + const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16); - sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]); + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); } + int sumi = sumi0 + sumi1; sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi; } @@ -5194,7 +5203,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r uint32_t qh; memcpy(&qh, x[ib].qh, sizeof(qh)); - int sumi = 0; + int sumi0 = 0; + int sumi1 = 0; for (int j = 0; j < qk/2; ++j) { const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; @@ -5203,9 +5213,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0; const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1; - sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]); + sumi0 += (x0 * y[ib].qs[j]); + sumi1 += (x1 * y[ib].qs[j + qk/2]); } + int sumi = sumi0 + sumi1; sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); } From ae7985cd7beca3b849328d169a8d592469cd021f Mon Sep 
17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 27 Jul 2024 15:53:48 +0300 Subject: [PATCH 18/21] sync : ggml ggml-ci --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 80159b70b..998b23ac6 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -e3b3846976c94163f2b3dd128cc959782653edbb +31d544f87835a55602883fe09156bb85a4c163d8 From 345c8c0c87a97c1595f9c8b14833d531c8c7d8df Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 27 Jul 2024 15:57:09 +0300 Subject: [PATCH 19/21] ggml : add missing semicolon (#0) ggml-ci --- ggml/src/ggml-quants.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index aa936fe5f..9016314f5 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -4477,7 +4477,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r sumf = hsum_float_8(acc) + summs; #endif for (; ib < nb; ++ib) { - int sumi0 = 0 + int sumi0 = 0; int sumi1 = 0; for (int j = 0; j < qk/2; ++j) { From 56f20aa25d5f97248a204b473c99f4040900f0e5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 27 Jul 2024 17:19:35 +0300 Subject: [PATCH 20/21] scripts : sync ggml-aarch64 sources --- scripts/sync-ggml-am.sh | 4 ++++ scripts/sync-ggml.sh | 2 ++ 2 files changed, 6 insertions(+) diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index ba3bedf21..f624e4881 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -102,6 +102,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # cmake/FindSIMD.cmake -> ggml/cmake/FindSIMD.cmake # # src/ggml.c -> ggml/src/ggml.c + # src/ggml-aarch64.c -> ggml/src/ggml-aarch64.c + # src/ggml-aarch64.h -> ggml/src/ggml-aarch64.h # src/ggml-alloc.c -> ggml/src/ggml-alloc.c # src/ggml-backend-impl.h -> ggml/src/ggml-backend-impl.h # src/ggml-backend.c -> ggml/src/ggml-backend.c @@ -143,6 +145,8 @@ if [ -f 
$SRC_LLAMA/ggml-src.patch ]; then -e 's/([[:space:]]|[ab]\/)src\/CMakeLists.txt/\1ggml\/src\/CMakeLists.txt/g' \ -e 's/([[:space:]]|[ab]\/)cmake\/FindSIMD.cmake/\1ggml\/cmake\/FindSIMD.cmake/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml\.c/\1ggml\/src\/ggml.c/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.c/\1ggml\/src\/ggml-aarch64.c/g' \ + -e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.h/\1ggml\/src\/ggml-aarch64.h/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-alloc\.c/\1ggml\/src\/ggml-alloc.c/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-backend-impl\.h/\1ggml\/src\/ggml-backend-impl.h/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.c/\1ggml\/src\/ggml-backend.c/g' \ diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index 402446ef9..af3784c3d 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -5,6 +5,8 @@ cp -rpv ../ggml/src/CMakeLists.txt ./ggml/src/CMakeLists.txt cp -rpv ../ggml/cmake/FindSIMD.cmake ./ggml/cmake/FindSIMD.cmake cp -rpv ../ggml/src/ggml.c ./ggml/src/ggml.c +cp -rpv ../ggml/src/ggml-aarch64.c ./ggml/src/ggml-aarch64.c +cp -rpv ../ggml/src/ggml-aarch64.h ./ggml/src/ggml-aarch64.h cp -rpv ../ggml/src/ggml-alloc.c ./ggml/src/ggml-alloc.c cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml/src/ggml-backend-impl.h cp -rpv ../ggml/src/ggml-backend.c ./ggml/src/ggml-backend.c From 5e2727fe0321c38d1664d26173c654fa1801dc5f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 27 Jul 2024 18:08:31 +0300 Subject: [PATCH 21/21] scripts : sync vulkan-shaders (#0) --- scripts/sync-ggml-am.sh | 2 ++ scripts/sync-ggml.sh | 1 + 2 files changed, 3 insertions(+) diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index f624e4881..c40025356 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -119,6 +119,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # src/ggml-sycl/* -> ggml/src/ggml-sycl/ # src/ggml-sycl.cpp -> ggml/src/ggml-sycl.cpp # src/ggml-vulkan.cpp -> ggml/src/ggml-vulkan.cpp + # src/vulkan-shaders/* -> 
ggml/src/vulkan-shaders/ # # include/ggml.h -> ggml/include/ggml.h # include/ggml-alloc.h -> ggml/include/ggml-alloc.h @@ -162,6 +163,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\.cpp/\1ggml\/src\/ggml-sycl.cpp/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-vulkan\.cpp/\1ggml\/src\/ggml-vulkan.cpp/g' \ + -e 's/([[:space:]]|[ab]\/)src\/vulkan-shaders\//\1ggml\/src\/vulkan-shaders\//g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml\.h/\1ggml\/include\/ggml.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-alloc\.h/\1ggml\/include\/ggml-alloc.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-backend\.h/\1ggml\/include\/ggml-backend.h/g' \ diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index af3784c3d..d6d7d0a60 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -23,6 +23,7 @@ cp -rpv ../ggml/src/ggml-rpc.cpp ./ggml/src/ggml-rpc.cpp cp -rpv ../ggml/src/ggml-sycl/* ./ggml/src/ggml-sycl/ cp -rpv ../ggml/src/ggml-sycl.cpp ./ggml/src/ggml-sycl.cpp cp -rpv ../ggml/src/ggml-vulkan.cpp ./ggml/src/ggml-vulkan.cpp +cp -rpv ../ggml/src/vulkan-shaders/* ./ggml/src/vulkan-shaders/ cp -rpv ../ggml/include/ggml.h ./ggml/include/ggml.h cp -rpv ../ggml/include/ggml-alloc.h ./ggml/include/ggml-alloc.h