From 2759916d86b70e7aceaed4d0b4e7ed126f0f9e51 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Wed, 4 Dec 2024 01:28:59 -0600 Subject: [PATCH 01/99] vulkan: Implement "fast divide" (mul+shift) for unary ops like copy (#10642) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 45 ++++++++++++++++++- .../vulkan-shaders/generic_unary_head.comp | 27 ++++++++--- tests/test-backend-ops.cpp | 2 + 3 files changed, 66 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 17e1be105..07b45d6b9 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -353,7 +353,45 @@ struct vk_op_unary_push_constants { uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; uint32_t d_offset; float param1; float param2; + uint32_t ne0_012mp; uint32_t ne0_012L; + uint32_t ne0_01mp; uint32_t ne0_01L; + uint32_t ne0_0mp; uint32_t ne0_0L; + uint32_t ne1_012mp; uint32_t ne1_012L; + uint32_t ne1_01mp; uint32_t ne1_01L; + uint32_t ne1_0mp; uint32_t ne1_0L; }; +static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128"); + +// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1. +// Precompute mp (m' in the paper) and L such that division +// can be computed using a multiply (high 32b of 64b result) +// and a shift: +// +// n/d = (mulhi(n, mp) + n) >> L; +void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L) +{ + // compute L = ceil(log2(d)); + L = 0; + while (L < 32 && (uint32_t{1} << L) < d) { + L++; + } + + mp = (uint32_t)((uint64_t{1} << 32) * ((uint64_t{1} << L) - d) / d + 1); +} + +template void init_pushconst_fastdiv(T &p) { + static_assert(!std::is_const::value, "unexpected type"); +} + +template <> void init_pushconst_fastdiv(vk_op_unary_push_constants &p) { + // Compute magic values to divide by these six numbers. 
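    // Worked example (illustrative only, with assumed values d = 320 and n = 1000):
    // init_fastdiv_values(320, mp, L) gives L = 9 (since 2^9 = 512 >= 320) and
    // mp = (2^32 * (512 - 320)) / 320 + 1 = 0x9999999A; then
    // mulhi(1000, mp) = 600 and (600 + 1000) >> 9 = 1600 >> 9 = 3, matching 1000 / 320.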
+ init_fastdiv_values(p.ne02*p.ne01*p.ne00, p.ne0_012mp, p.ne0_012L); + init_fastdiv_values(p.ne01*p.ne00, p.ne0_01mp, p.ne0_01L); + init_fastdiv_values(p.ne00, p.ne0_0mp, p.ne0_0L); + init_fastdiv_values(p.ne12*p.ne11*p.ne10, p.ne1_012mp, p.ne1_012L); + init_fastdiv_values(p.ne11*p.ne10, p.ne1_01mp, p.ne1_01L); + init_fastdiv_values(p.ne10, p.ne1_0mp, p.ne1_0L); +} struct vk_op_binary_push_constants { uint32_t ne; @@ -2914,13 +2952,14 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& elements = { ne, 1, 1 }; } - const vk_op_unary_push_constants pc = { + vk_op_unary_push_constants pc = { (uint32_t)ne, (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size, (uint32_t)tensor->nb[3] / tensor_type_size, (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], 1 , (uint32_t)tensor->ne[0] , (uint32_t)(tensor->ne[0] * tensor->ne[1]) , (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]), 0, 0.0f, 0.0f, }; + init_pushconst_fastdiv(pc); ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, elements); } @@ -4125,7 +4164,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) { } template -static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc, bool dryrun = false) { +static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, PC&& pc, bool dryrun = false) { VK_LOG_DEBUG("ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; if (src1 != nullptr) { std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; @@ -4165,6 +4204,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co const uint64_t ned3 = dst->ne[3]; const uint64_t ned = ned0 * ned1; + init_pushconst_fastdiv(pc); + vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op); if (pipeline == nullptr) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp index 4e1fa3af3..ab7c9d7eb 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.comp @@ -8,6 +8,13 @@ layout (push_constant) uniform parameter uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13; uint d_offset; float param1; float param2; + + uint ne0_012mp; uint ne0_012L; + uint ne0_01mp; uint ne0_01L; + uint ne0_0mp; uint ne0_0L; + uint ne1_012mp; uint ne1_012L; + uint ne1_01mp; uint ne1_01L; + uint ne1_0mp; uint ne1_0L; } p; layout (binding = 0) readonly buffer A 
{A_TYPE data_a[];}; @@ -17,22 +24,30 @@ uint get_idx() { return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; } +// see init_fastdiv_values in ggml-vulkan.cpp +uint fastdiv(uint n, uint mp, uint L) { + uint msbs, lsbs; + // msbs = mulhi(n, mp) + umulExtended(n, mp, msbs, lsbs); + return (msbs + n) >> L; +} + uint src0_idx(uint idx) { - const uint i03 = idx / (p.ne02*p.ne01*p.ne00); + const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L); const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; - const uint i02 = (idx - i03_offset) / (p.ne01*p.ne00); + const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L); const uint i02_offset = i02*p.ne01*p.ne00; - const uint i01 = (idx - i03_offset - i02_offset) / p.ne00; + const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L); const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00; return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00; } uint dst_idx(uint idx) { - const uint i13 = idx / (p.ne12*p.ne11*p.ne10); + const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L); const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; - const uint i12 = (idx - i13_offset) / (p.ne11*p.ne10); + const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L); const uint i12_offset = i12*p.ne11*p.ne10; - const uint i11 = (idx - i13_offset - i12_offset) / p.ne10; + const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L); const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10; } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 87c92dadd..807d271c6 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3862,6 +3862,8 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1})); test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8192, 512, 2, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3072, 512, 2, 1}, {0, 2, 1, 3})); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, 1.0f, 0.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 4096, 5, 1}, false, 1.0f, 0.0f)); From 8d0cfd554a9ae545ff94d27e04458f537b4e8c0e Mon Sep 17 00:00:00 2001 From: JFLFY2255 Date: Wed, 4 Dec 2024 17:42:50 +0800 Subject: [PATCH 02/99] llama: Support MiniCPM-1B (with & w/o longrope) (#10559) --- convert_hf_to_gguf.py | 57 ++++++++----- gguf-py/gguf/constants.py | 9 +- include/llama.h | 3 +- src/llama.cpp | 175 +++++--------------------------------- 4 files changed, 61 insertions(+), 183 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index b931049d1..d8df5cc00 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1831,29 +1831,40 @@ class MiniCPMModel(Model): model_arch = gguf.MODEL_ARCH.MINICPM def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - 
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) + super().set_gguf_parameters() + embedding_scale = float(self.hparams["scale_emb"]) + self.gguf_writer.add_embedding_scale(embedding_scale) + logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}") + residual_scale = self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5 + self.gguf_writer.add_residual_scale(residual_scale) + logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}") + logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"] + self.gguf_writer.add_logit_scale(logit_scale) + logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}") + if self.hparams.get("rope_scaling") is not None: + if self.hparams["rope_scaling"].get("type") == "longrope": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE) + logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}") + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + + rope_scaling = self.find_hparam(['rope_scaling'], True) + if rope_scaling is not None: + long_factors = rope_scaling.get('long_factor', None) + short_factors = rope_scaling.get('short_factor', None) + + if long_factors is None or short_factors is None: + raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') + + if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) def set_vocab(self): - self._set_vocab_llama_hf() - - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape) - ) + self._set_vocab_sentencepiece() def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -1863,9 +1874,9 @@ class MiniCPMModel(Model): # HF models permute some of the tensors, so we need to undo that if name.endswith(("q_proj.weight")): - data_torch = self._reverse_hf_permute(data_torch, n_head, n_head) + data_torch = LlamaModel.permute(data_torch, n_head, n_head) if name.endswith(("k_proj.weight")): - data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head) + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) return [(self.map_tensor_name(name), data_torch)] diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 7df23371c..703199fcb 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -896,6 +896,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.OUTPUT, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ROPE_FACTORS_LONG, + MODEL_TENSOR.ROPE_FACTORS_SHORT, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, @@ 
-1388,9 +1390,10 @@ class TokenType(IntEnum): class RopeScalingType(Enum): - NONE = 'none' - LINEAR = 'linear' - YARN = 'yarn' + NONE = 'none' + LINEAR = 'linear' + YARN = 'yarn' + LONGROPE = 'longrope' class PoolingType(IntEnum): diff --git a/include/llama.h b/include/llama.h index e85f459fc..168c3fa1f 100644 --- a/include/llama.h +++ b/include/llama.h @@ -185,7 +185,8 @@ extern "C" { LLAMA_ROPE_SCALING_TYPE_NONE = 0, LLAMA_ROPE_SCALING_TYPE_LINEAR = 1, LLAMA_ROPE_SCALING_TYPE_YARN = 2, - LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN, + LLAMA_ROPE_SCALING_TYPE_LONGROPE = 3, + LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_LONGROPE, }; enum llama_pooling_type { diff --git a/src/llama.cpp b/src/llama.cpp index 6a6f4c2a5..00f78639e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1036,6 +1036,8 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, { LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" }, + { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, @@ -1683,9 +1685,10 @@ struct LLM_TN { // static const std::map LLAMA_ROPE_SCALING_TYPES = { - { LLAMA_ROPE_SCALING_TYPE_NONE, "none" }, - { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" }, - { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" }, + { LLAMA_ROPE_SCALING_TYPE_NONE, "none" }, + { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" }, + { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" }, + { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" }, }; static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) { @@ -5580,8 +5583,12 @@ static void llm_load_hparams( case LLM_ARCH_MINICPM: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale); + ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale); + ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); switch (hparams.n_layer) { + case 52: model.type = e_model::MODEL_1B; break; case 40: model.type = e_model::MODEL_2B; break; default: model.type = e_model::MODEL_UNKNOWN; } @@ -7065,7 +7072,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); } - if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) { + if (model.arch == LLM_ARCH_MINICPM || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) { LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale); LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale); LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale); @@ -7690,7 +7697,13 @@ static bool llm_load_tensors( layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); - layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) { + layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? 
llama_model_loader::TENSOR_DUPLICATED : 0)); + layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + } + else { + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + } if (n_expert == 0) { layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); @@ -13497,153 +13510,6 @@ struct llm_build_context { return gf; } - // ref: https://arxiv.org/abs/2203.03466 - // https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738 - // based on the original build_llama() function - struct ggml_cgraph * build_minicpm() { - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); - - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); - - const int64_t n_embd = hparams.n_embd; - //TODO: if the model varies, these parameters need to be read from the model - const int64_t n_embd_base = 256; - const float scale_embd = 12.0f; - const float scale_depth = 1.4f; - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); - - // scale the input embeddings - inpL = ggml_scale(ctx0, inpL, scale_embd); - cb(inpL, "inp_scaled", -1); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = build_inp_pos(); - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - cur = llm_build_norm(ctx0, inpL, hparams, - model.layers[il].attn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); - - // self-attention - { - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } - - struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } - - struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } - - Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Kcur, "Kcur", il); - - cur = llm_build_kv(ctx0, lctx, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - } - - if (il == n_layer - 1) { - // skip computing output for unused tokens - struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_get_rows(ctx0, 
cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - // scale_res - scale the hidden states for residual connection - const float scale_res = scale_depth/sqrtf(float(n_layer)); - cur = ggml_scale(ctx0, cur, scale_res); - cb(cur, "hidden_scaled", -1); - - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); - - // feed-forward network - { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(cur, "ffn_norm", il); - - cur = llm_build_ffn(ctx0, lctx, cur, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); - cb(cur, "ffn_out", il); - } - - // scale the hidden states for residual connection - cur = ggml_scale(ctx0, cur, scale_res); - cb(cur, "hidden_scaled_ffn", -1); - - cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); - cb(cur, "l_out", il); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - cur = llm_build_norm(ctx0, cur, hparams, - model.output_norm, NULL, - LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); - - // lm_head scaling - const float scale_lmhead = float(n_embd_base)/float(n_embd); - cur = ggml_scale(ctx0, cur, scale_lmhead); - cb(cur, "lmhead_scaling", -1); - - // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); - cb(cur, "result_output", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - struct ggml_cgraph * build_minicpm3() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); @@ -16742,6 +16608,7 @@ static struct ggml_cgraph * llama_build_graph( switch (model.arch) { case LLM_ARCH_LLAMA: + case LLM_ARCH_MINICPM: case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE_MOE: { @@ -16825,10 +16692,6 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_internlm2(); } break; - case LLM_ARCH_MINICPM: - { - result = llm.build_minicpm(); - } break; case LLM_ARCH_MINICPM3: { result = llm.build_minicpm3(); From 253b7fde910731104670724391bfbcb94d97d0c3 Mon Sep 17 00:00:00 2001 From: ltoniazzi <61414566+ltoniazzi@users.noreply.github.com> Date: Wed, 4 Dec 2024 09:45:48 +0000 Subject: [PATCH 03/99] Fix HF repo commit to clone lora test models (#10649) --- tests/test-lora-conversion-inference.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test-lora-conversion-inference.sh b/tests/test-lora-conversion-inference.sh index fe90ce0d1..fb308a9ff 100755 --- a/tests/test-lora-conversion-inference.sh +++ b/tests/test-lora-conversion-inference.sh @@ -10,11 +10,16 @@ declare -a params=( MODELS_REPO=lora-tests MODELS_REPO_URL=https://huggingface.co/ggml-org/$MODELS_REPO +COMMIT=c26d5fb85b4070a9e9c4e65d132c783b98086890 # Clone the Hugging Face repository if the directory does not exist if [ ! -d "$MODELS_REPO" ]; then echo "Cloning the Hugging Face repository..." git clone $MODELS_REPO_URL --depth 1 + cd $MODELS_REPO + git fetch --depth=1 origin $COMMIT + git reset --hard $COMMIT + cd - else echo "Repository already exists. Skipping clone." 
fi From 2803540814bf0a4e44d0960ff6afda6bac971c17 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Wed, 4 Dec 2024 14:40:44 +0100 Subject: [PATCH 04/99] ggml-cpu : fix HWCAP2_I8MM value (#10646) --- ggml/src/ggml-cpu/ggml-cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 23ae2e10c..e4a9ca013 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -2425,7 +2425,7 @@ bool ggml_is_numa(void) { #endif #if !defined(HWCAP2_I8MM) -#define HWCAP2_I8MM 0 +#define HWCAP2_I8MM (1 << 13) #endif static void ggml_init_arm_arch_features(void) { From 59f4db10883a4f3e855cffbf2c3ab68430e95272 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Wed, 4 Dec 2024 14:45:40 +0100 Subject: [PATCH 05/99] ggml : add predefined list of CPU backend variants to build (#10626) * ggml : add predefined list of CPU backend variants to build * update CPU dockerfiles --- .devops/full.Dockerfile | 33 +- .devops/llama-cli.Dockerfile | 18 +- .devops/llama-server.Dockerfile | 22 +- ggml/CMakeLists.txt | 49 ++- ggml/src/CMakeLists.txt | 35 ++ ggml/src/ggml-backend-reg.cpp | 32 +- ggml/src/ggml-cpu/CMakeLists.txt | 595 +++++++++++++++------------- ggml/src/ggml-cpu/cpu-feats-x86.cpp | 85 ++-- ggml/src/ggml-cpu/ggml-cpu.c | 2 +- ggml/src/ggml-cpu/ggml-cpu.cpp | 10 +- scripts/build-cpu.sh | 12 - 11 files changed, 502 insertions(+), 391 deletions(-) delete mode 100755 scripts/build-cpu.sh diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile index 2a06f82b7..d93c0be6a 100644 --- a/.devops/full.Dockerfile +++ b/.devops/full.Dockerfile @@ -3,23 +3,36 @@ ARG UBUNTU_VERSION=22.04 FROM ubuntu:$UBUNTU_VERSION AS build RUN apt-get update && \ - apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1 - -COPY requirements.txt requirements.txt -COPY requirements requirements - -RUN pip install --upgrade pip setuptools wheel \ - && pip install -r requirements.txt + apt-get install -y build-essential git cmake libcurl4-openssl-dev WORKDIR /app COPY . . -ENV LLAMA_CURL=1 +RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \ + cmake --build build -j $(nproc) && \ + mkdir -p /app/lib && \ + find build -name "*.so" -exec cp {} /app/lib/ \; +FROM ubuntu:$UBUNTU_VERSION as runtime -RUN make -j$(nproc) +WORKDIR /app + +RUN apt-get update && \ + apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1 + +COPY requirements.txt /app/requirements.txt +COPY requirements /app/requirements +COPY .devops/tools.sh /app/tools.sh + +RUN pip install --upgrade pip setuptools wheel && \ + pip install -r /app/requirements.txt + +COPY --from=build /app/build/bin/ /app/ +COPY --from=build /app/lib/ /app/ +COPY --from=build /app/convert_hf_to_gguf.py /app/ +COPY --from=build /app/gguf-py /app/gguf-py ENV LC_ALL=C.utf8 -ENTRYPOINT ["/app/.devops/tools.sh"] +ENTRYPOINT ["/app/tools.sh"] diff --git a/.devops/llama-cli.Dockerfile b/.devops/llama-cli.Dockerfile index 7f741aa46..be234d55d 100644 --- a/.devops/llama-cli.Dockerfile +++ b/.devops/llama-cli.Dockerfile @@ -3,21 +3,27 @@ ARG UBUNTU_VERSION=22.04 FROM ubuntu:$UBUNTU_VERSION AS build RUN apt-get update && \ - apt-get install -y build-essential git + apt-get install -y build-essential git cmake libcurl4-openssl-dev WORKDIR /app COPY . . -RUN make -j$(nproc) llama-cli +RUN cmake -S . 
-B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \ + cmake --build build -j $(nproc) && \ + mkdir -p /app/lib && \ + find build -name "*.so" -exec cp {} /app/lib/ \; FROM ubuntu:$UBUNTU_VERSION AS runtime -RUN apt-get update && \ - apt-get install -y libgomp1 +WORKDIR /app -COPY --from=build /app/llama-cli /llama-cli +RUN apt-get update && \ + apt-get install -y libcurl4-openssl-dev libgomp1 curl + +COPY --from=build /app/build/bin/llama-cli /app/ +COPY --from=build /app/lib/ /app/ ENV LC_ALL=C.utf8 -ENTRYPOINT [ "/llama-cli" ] +ENTRYPOINT [ "/app/llama-cli" ] diff --git a/.devops/llama-server.Dockerfile b/.devops/llama-server.Dockerfile index 7110dda9e..72ccde2fe 100644 --- a/.devops/llama-server.Dockerfile +++ b/.devops/llama-server.Dockerfile @@ -9,28 +9,20 @@ WORKDIR /app COPY . . - -RUN \ - # Build multiple versions of the CPU backend - scripts/build-cpu.sh avx -DGGML_AVX=ON -DGGML_AVX2=OFF && \ - scripts/build-cpu.sh avx2 -DGGML_AVX=ON -DGGML_AVX2=ON && \ - scripts/build-cpu.sh avx512 -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON && \ - scripts/build-cpu.sh amx -DGGML_AVX=ON -DGGML_AVX2=ON -DGGML_AVX512=ON -DGGML_AVX_VNNI=ON -DGGML_AVX512_VNNI=ON -DGGML_AMX_TILE=ON -DGGML_AMX_INT8=ON && \ - # Build llama-server - cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \ - cmake --build build --target llama-server -j $(nproc) && \ - # Copy the built libraries to /app/lib +RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \ + cmake --build build -j $(nproc) && \ mkdir -p /app/lib && \ - mv libggml-cpu* /app/lib/ && \ find build -name "*.so" -exec cp {} /app/lib/ \; FROM ubuntu:$UBUNTU_VERSION AS runtime +WORKDIR /app + RUN apt-get update && \ apt-get install -y libcurl4-openssl-dev libgomp1 curl -COPY --from=build /app/build/bin/llama-server /llama-server -COPY --from=build /app/lib/ / +COPY --from=build /app/build/bin/llama-server /app/ +COPY --from=build /app/lib/ /app/ ENV LC_ALL=C.utf8 # Must be set to 0.0.0.0 so it can listen to requests from host machine @@ -38,4 +30,4 @@ ENV LLAMA_ARG_HOST=0.0.0.0 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] -ENTRYPOINT [ "/llama-server" ] +ENTRYPOINT [ "/app/llama-server" ] diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 06d371e09..1b3d98967 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -92,30 +92,33 @@ else() set(INS_ENB ON) endif() -option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF) -option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON) - -option(GGML_AVX "ggml: enable AVX" ${INS_ENB}) -option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF) -option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB}) -option(GGML_AVX512 "ggml: enable AVX512" OFF) -option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF) -option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF) -option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF) -option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF) -option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF) -option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF) -option(GGML_FMA "ggml: enable FMA" ${INS_ENB}) +option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF) +option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON) +option(GGML_AVX "ggml: enable AVX" ${INS_ENB}) +option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF) +option(GGML_AVX2 
"ggml: enable AVX2" ${INS_ENB}) +option(GGML_AVX512 "ggml: enable AVX512F" OFF) +option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF) +option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF) +option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF) if (NOT MSVC) - option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512 + # in MSVC F16C and FMA is implied with AVX2/AVX512 + option(GGML_FMA "ggml: enable FMA" ${INS_ENB}) + option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) + # MSVC does not seem to support AMX + option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF) + option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF) + option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF) endif() -option(GGML_LASX "ggml: enable lasx" ON) -option(GGML_LSX "ggml: enable lsx" ON) -option(GGML_RVV "ggml: enable rvv" ON) -option(GGML_SVE "ggml: enable SVE" OFF) +option(GGML_LASX "ggml: enable lasx" ON) +option(GGML_LSX "ggml: enable lsx" ON) +option(GGML_RVV "ggml: enable rvv" ON) +option(GGML_SVE "ggml: enable SVE" OFF) +option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF) + if (WIN32) - set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version") + set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version") endif() # ggml core @@ -180,11 +183,7 @@ option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE}) set(CMAKE_C_STANDARD 11) set(CMAKE_C_STANDARD_REQUIRED true) -if (GGML_SYCL) - set(CMAKE_CXX_STANDARD 17) -else() - set(CMAKE_CXX_STANDARD 11) -endif() +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED true) set(THREADS_PREFER_PTHREAD_FLAG ON) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 19289f32b..f07533fdb 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -269,7 +269,42 @@ function(ggml_add_backend backend) endif() endfunction() +function(ggml_add_cpu_backend_variant tag_name) + set(GGML_CPU_TAG_NAME ${tag_name}) + # other: OPENMP LLAMAFILE CPU_HBM + foreach (feat NATIVE + AVX AVX2 AVX_VNNI FMA F16C + AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 + AMX_TILE AMX_INT8 AMX_BF16) + set(GGML_${feat} OFF) + endforeach() + + foreach (feat ${ARGN}) + set(GGML_${feat} ON) + endforeach() + + ggml_add_cpu_backend_variant_impl(${tag_name}) +endfunction() + ggml_add_backend(CPU) + +if (GGML_CPU_ALL_VARIANTS) + if (NOT GGML_BACKEND_DL) + message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL") + endif() + ggml_add_cpu_backend_variant(sandybridge AVX) + ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA) + ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512) + ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI) + if (NOT MSVC) + # MSVC doesn't support AVX-VNNI or AMX + ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI) + ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) + endif() +else () + ggml_add_cpu_backend_variant_impl("") +endif() + ggml_add_backend(BLAS) ggml_add_backend(CANN) ggml_add_backend(CUDA) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 2c4bf11b0..5cb0fb9d1 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -483,6 +483,10 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) best_score = s; best_path = entry.path().string(); } + } else { + if (!silent) { + GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", 
__func__, entry.path().string().c_str()); + } } } } @@ -505,15 +509,21 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) } void ggml_backend_load_all() { - ggml_backend_load_best("blas", true); - ggml_backend_load_best("cann", true); - ggml_backend_load_best("cuda", true); - ggml_backend_load_best("hip", true); - ggml_backend_load_best("kompute", true); - ggml_backend_load_best("metal", true); - ggml_backend_load_best("rpc", true); - ggml_backend_load_best("sycl", true); - ggml_backend_load_best("vulkan", true); - ggml_backend_load_best("musa", true); - ggml_backend_load_best("cpu", true); +#ifdef NDEBUG + bool silent = true; +#else + bool silent = false; +#endif + + ggml_backend_load_best("blas", silent); + ggml_backend_load_best("cann", silent); + ggml_backend_load_best("cuda", silent); + ggml_backend_load_best("hip", silent); + ggml_backend_load_best("kompute", silent); + ggml_backend_load_best("metal", silent); + ggml_backend_load_best("rpc", silent); + ggml_backend_load_best("sycl", silent); + ggml_backend_load_best("vulkan", silent); + ggml_backend_load_best("musa", silent); + ggml_backend_load_best("cpu", silent); } diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 5df63884c..bc326c059 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -1,319 +1,354 @@ -ggml_add_backend_library(ggml-cpu) - -list (APPEND GGML_CPU_SOURCES - ggml-cpu.c - ggml-cpu.cpp - ggml-cpu-aarch64.c - ggml-cpu-aarch64.h - ggml-cpu-quants.c - ggml-cpu-quants.h - amx/amx.cpp - amx/amx.h - amx/mmq.cpp - amx/mmq.h - ggml-cpu-impl.h - ) - -target_compile_features(ggml-cpu PRIVATE c_std_11 cxx_std_17) -target_include_directories(ggml-cpu PRIVATE .) - -if (APPLE AND GGML_ACCELERATE) - find_library(ACCELERATE_FRAMEWORK Accelerate) - if (ACCELERATE_FRAMEWORK) - message(STATUS "Accelerate framework found") - - target_compile_definitions(ggml-cpu PRIVATE GGML_USE_ACCELERATE) - target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_NEW_LAPACK) - target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_LAPACK_ILP64) - - target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK}) +function(ggml_add_cpu_backend_variant_impl tag_name) + if (tag_name) + set(GGML_CPU_NAME ggml-cpu-${tag_name}) else() - message(WARNING "Accelerate framework not found") + set(GGML_CPU_NAME ggml-cpu) endif() -endif() -if (GGML_OPENMP) - find_package(OpenMP) - if (OpenMP_FOUND) - message(STATUS "OpenMP found") + ggml_add_backend_library(${GGML_CPU_NAME}) - target_compile_definitions(ggml-cpu PRIVATE GGML_USE_OPENMP) + list (APPEND GGML_CPU_SOURCES + ggml-cpu/ggml-cpu.c + ggml-cpu/ggml-cpu.cpp + ggml-cpu/ggml-cpu-aarch64.c + ggml-cpu/ggml-cpu-aarch64.h + ggml-cpu/ggml-cpu-quants.c + ggml-cpu/ggml-cpu-quants.h + ggml-cpu/amx/amx.cpp + ggml-cpu/amx/amx.h + ggml-cpu/amx/mmq.cpp + ggml-cpu/amx/mmq.h + ggml-cpu/ggml-cpu-impl.h + ) - target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) - else() - message(WARNING "OpenMP not found") + target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17) + target_include_directories(${GGML_CPU_NAME} PRIVATE . 
ggml-cpu) + + if (APPLE AND GGML_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate) + if (ACCELERATE_FRAMEWORK) + message(STATUS "Accelerate framework found") + + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_ACCELERATE) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE ACCELERATE_NEW_LAPACK) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE ACCELERATE_LAPACK_ILP64) + + target_link_libraries(${GGML_CPU_NAME} PRIVATE ${ACCELERATE_FRAMEWORK}) + else() + message(WARNING "Accelerate framework not found") + endif() endif() -endif() -if (GGML_LLAMAFILE) - message(STATUS "Using llamafile") + if (GGML_OPENMP) + find_package(OpenMP) + if (OpenMP_FOUND) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_OPENMP) - target_compile_definitions(ggml-cpu PRIVATE GGML_USE_LLAMAFILE) + target_link_libraries(${GGML_CPU_NAME} PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + else() + message(WARNING "OpenMP not found") + endif() + endif() - list(APPEND GGML_CPU_SOURCES - llamafile/sgemm.cpp - llamafile/sgemm.h) -endif() + if (GGML_LLAMAFILE) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_LLAMAFILE) -if (GGML_CPU_HBM) - find_library(memkind memkind REQUIRED) + list(APPEND GGML_CPU_SOURCES + ggml-cpu/llamafile/sgemm.cpp + ggml-cpu/llamafile/sgemm.h) + endif() - message(STATUS "Using memkind for CPU HBM") + if (GGML_CPU_HBM) + find_library(memkind memkind REQUIRED) - target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_HBM) + message(STATUS "Using memkind for CPU HBM") - target_link_libraries(ggml-cpu PUBLIC memkind) -endif() + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_HBM) -if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR - CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR - (NOT CMAKE_OSX_ARCHITECTURES AND - NOT CMAKE_GENERATOR_PLATFORM_LWR AND - CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$")) + target_link_libraries(${GGML_CPU_NAME} PUBLIC memkind) + endif() - message(STATUS "ARM detected") + if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR + CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR + (NOT CMAKE_OSX_ARCHITECTURES AND + NOT CMAKE_GENERATOR_PLATFORM_LWR AND + CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$")) - if (MSVC) - list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead - list(APPEND ARCH_DEFINITIONS __ARM_NEON) - list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA) + message(STATUS "ARM detected") - set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS}) - string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2") + if (MSVC) + list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead + list(APPEND ARCH_DEFINITIONS __ARM_NEON) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA) - check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) - if (GGML_COMPILER_SUPPORT_DOTPROD) - list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD) + set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS}) + string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2") - message(STATUS "ARM feature DOTPROD enabled") - endif () + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) + if (GGML_COMPILER_SUPPORT_DOTPROD) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD) - check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, 
_b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) - - if (GGML_COMPILER_SUPPORT_MATMUL_INT8) - list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8) - - message(STATUS "ARM feature MATMUL_INT8 enabled") - endif () - - check_cxx_source_compiles("#include \nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) - if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) - list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - - message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled") - endif () - - set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV}) - elseif (APPLE) - if (GGML_NATIVE) - set(USER_PROVIDED_MARCH FALSE) - foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS) - if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+") - set(USER_PROVIDED_MARCH TRUE) - break() - endif() - endforeach() - - if (NOT USER_PROVIDED_MARCH) - set(MARCH_FLAGS "-march=armv8.2a") - - check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) - if (GGML_COMPILER_SUPPORT_DOTPROD) - set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod") - list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD) - - message(STATUS "ARM feature DOTPROD enabled") - endif () - - set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm") - - set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) - set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}") - - check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) - if (GGML_COMPILER_SUPPORT_MATMUL_INT8) - set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm") - list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8) - - message(STATUS "ARM feature MATMUL_INT8 enabled") - endif () - - set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) - - list(APPEND ARCH_FLAGS "${MARCH_FLAGS}") + message(STATUS "ARM feature DOTPROD enabled") endif () - endif () - else() - check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) - if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") - list(APPEND ARCH_FLAGS -mfp16-format=ieee) - endif() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") - # Raspberry Pi 1, Zero - list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access) - endif() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") - if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android") - # Android armeabi-v7a - list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations) - else() - # Raspberry Pi 2 - list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) + + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) + + if (GGML_COMPILER_SUPPORT_MATMUL_INT8) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8) + + message(STATUS "ARM feature MATMUL_INT8 enabled") + endif () + + check_cxx_source_compiles("#include \nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) + if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + + message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled") + endif () + + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV}) + elseif (APPLE) + if (GGML_NATIVE) + set(USER_PROVIDED_MARCH 
FALSE) + foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS) + if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+") + set(USER_PROVIDED_MARCH TRUE) + break() + endif() + endforeach() + + if (NOT USER_PROVIDED_MARCH) + set(MARCH_FLAGS "-march=armv8.2a") + + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) + if (GGML_COMPILER_SUPPORT_DOTPROD) + set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod") + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD) + + message(STATUS "ARM feature DOTPROD enabled") + endif () + + set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm") + + set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}") + + check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) + if (GGML_COMPILER_SUPPORT_MATMUL_INT8) + set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm") + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8) + + message(STATUS "ARM feature MATMUL_INT8 enabled") + endif () + + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) + + list(APPEND ARCH_FLAGS "${MARCH_FLAGS}") + endif () + endif () + else() + check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) + if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") + list(APPEND ARCH_FLAGS -mfp16-format=ieee) + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") + # Raspberry Pi 1, Zero + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access) + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") + if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android") + # Android armeabi-v7a + list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations) + else() + # Raspberry Pi 2 + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) + endif() + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") + # Android arm64-v8a + # Raspberry Pi 3, 4, Zero 2 (32-bit) + list(APPEND ARCH_FLAGS -mno-unaligned-access) + endif() + if (GGML_SVE) + list(APPEND ARCH_FLAGS -march=armv8.6-a+sve) endif() endif() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") - # Android arm64-v8a - # Raspberry Pi 3, 4, Zero 2 (32-bit) - list(APPEND ARCH_FLAGS -mno-unaligned-access) - endif() - if (GGML_SVE) - list(APPEND ARCH_FLAGS -march=armv8.6-a+sve) - endif() - endif() -elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR - (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND - CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$")) - message(STATUS "x86 detected") - if (MSVC) - # instruction set detection for MSVC only - if (GGML_NATIVE) - include(cmake/FindSIMD.cmake) - endif () - if (GGML_AVX512) - list(APPEND ARCH_FLAGS /arch:AVX512) - # MSVC has no compile-time flags enabling specific - # AVX512 extensions, neither it defines the - # macros corresponding to the extensions. - # Do it manually. 
- if (GGML_AVX512_VBMI) - list(APPEND ARCH_DEFINITIONS __AVX512VBMI__) - if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR + (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND + CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$")) + if (MSVC) + # instruction set detection for MSVC only + if (GGML_NATIVE) + include(ggml-cpu/cmake/FindSIMD.cmake) + endif () + if (GGML_AVX512) + list(APPEND ARCH_FLAGS /arch:AVX512) + # /arch:AVX512 includes: __AVX512F__, __AVX512CD__, __AVX512BW__, __AVX512DQ__, and __AVX512VL__ + # MSVC has no compile-time flags enabling specific + # AVX512 extensions, neither it defines the + # macros corresponding to the extensions. + # Do it manually. + list(APPEND ARCH_DEFINITIONS GGML_AVX512) + if (GGML_AVX512_VBMI) + list(APPEND ARCH_DEFINITIONS __AVX512VBMI__) + if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + list(APPEND ARCH_FLAGS -mavx512vbmi) + endif() + endif() + if (GGML_AVX512_VNNI) + list(APPEND ARCH_DEFINITIONS __AVX512VNNI__ GGML_AVX512_VNNI) + if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + list(APPEND ARCH_FLAGS -mavx512vnni) + endif() + endif() + if (GGML_AVX512_BF16) + list(APPEND ARCH_DEFINITIONS __AVX512BF16__ GGML_AVX512_BF16) + if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + list(APPEND ARCH_FLAGS -mavx512bf16) + endif() + endif() + if (GGML_AMX_TILE) + list(APPEND ARCH_DEFINITIONS __AMX_TILE__ GGML_AMX_TILE) + endif() + if (GGML_AMX_INT8) + list(APPEND ARCH_DEFINITIONS __AMX_INT8__ GGML_AMX_INT8) + endif() + if (GGML_AMX_BF16) + list(APPEND ARCH_DEFINITIONS __AMX_BF16__ GGML_AMX_BF16) + endif() + elseif (GGML_AVX2) + list(APPEND ARCH_FLAGS /arch:AVX2) + list(APPEND ARCH_DEFINITIONS GGML_AVX2 GGML_FMA GGML_F16C) + elseif (GGML_AVX) + list(APPEND ARCH_FLAGS /arch:AVX) + list(APPEND ARCH_DEFINITIONS GGML_AVX) + else () + list(APPEND ARCH_FLAGS /arch:SSE4.2) + list(APPEND ARCH_DEFINITIONS GGML_SSE42) + endif() + if (GGML_AVX_VNNI) + # MSVC generates AVX512 with AVX-VNNI intrinsics even with /arch:AVX2 + #list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI) + endif() + else () + if (GGML_NATIVE) + list(APPEND ARCH_FLAGS -march=native) + else () + list(APPEND ARCH_FLAGS -msse4.2) + list(APPEND ARCH_DEFINITIONS GGML_SSE42) + if (GGML_F16C) + list(APPEND ARCH_FLAGS -mf16c) + list(APPEND ARCH_DEFINITIONS GGML_F16C) + endif() + if (GGML_FMA) + list(APPEND ARCH_FLAGS -mfma) + list(APPEND ARCH_DEFINITIONS GGML_FMA) + endif() + if (GGML_AVX) + list(APPEND ARCH_FLAGS -mavx) + list(APPEND ARCH_DEFINITIONS GGML_AVX) + endif() + if (GGML_AVX2) + list(APPEND ARCH_FLAGS -mavx2) + list(APPEND ARCH_DEFINITIONS GGML_AVX2) + endif() + if (GGML_AVX_VNNI) + list(APPEND ARCH_FLAGS -mavxvnni) + list(APPEND ARCH_DEFINITIONS GGML_AVX_VNNI) + endif() + if (GGML_AVX512) + list(APPEND ARCH_FLAGS -mavx512f) + list(APPEND ARCH_FLAGS -mavx512cd) + list(APPEND ARCH_FLAGS -mavx512vl) + list(APPEND ARCH_FLAGS -mavx512dq) + list(APPEND ARCH_FLAGS -mavx512bw) + list(APPEND ARCH_DEFINITIONS GGML_AVX512) + endif() + if (GGML_AVX512_VBMI) list(APPEND ARCH_FLAGS -mavx512vbmi) + list(APPEND ARCH_DEFINITIONS GGML_AVX512_VBMI) endif() - endif() - if (GGML_AVX512_VNNI) - list(APPEND ARCH_DEFINITIONS __AVX512VNNI__) - if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + if (GGML_AVX512_VNNI) list(APPEND ARCH_FLAGS -mavx512vnni) + list(APPEND ARCH_DEFINITIONS GGML_AVX512_VNNI) endif() - endif() - if (GGML_AVX512_BF16) - list(APPEND ARCH_DEFINITIONS __AVX512BF16__) - if 
(CMAKE_C_COMPILER_ID STREQUAL "Clang") + if (GGML_AVX512_BF16) list(APPEND ARCH_FLAGS -mavx512bf16) + list(APPEND ARCH_DEFINITIONS GGML_AVX512_BF16) + endif() + if (GGML_AMX_TILE) + list(APPEND ARCH_FLAGS -mamx-tile) + list(APPEND ARCH_DEFINITIONS GGML_AMX_TILE) + endif() + if (GGML_AMX_INT8) + list(APPEND ARCH_FLAGS -mamx-int8) + list(APPEND ARCH_DEFINITIONS GGML_AMX_INT8) + endif() + if (GGML_AMX_BF16) + list(APPEND ARCH_FLAGS -mamx-bf16) + list(APPEND ARCH_DEFINITIONS GGML_AMX_BF16) endif() endif() - if (GGML_AMX_TILE) - list(APPEND ARCH_DEFINITIONS __AMX_TILE__) - endif() - if (GGML_AMX_INT8) - list(APPEND ARCH_DEFINITIONS __AMX_INT8__) - endif() - if (GGML_AMX_BF16) - list(APPEND ARCH_DEFINITIONS __AMX_BF16__) - endif() - elseif (GGML_AVX2) - list(APPEND ARCH_FLAGS /arch:AVX2) - elseif (GGML_AVX) - list(APPEND ARCH_FLAGS /arch:AVX) endif() - if (GGML_AVX_VNNI) - list(APPEND ARCH_DEFINITIONS __AVXVNNI__) - if (CMAKE_C_COMPILER_ID STREQUAL "Clang") - list(APPEND ARCH_FLAGS -mavxvnni) - endif() + elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") + message(STATUS "PowerPC detected") + execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M) + string(FIND "${POWER10_M}" "POWER10" substring_index) + if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "") + set(substring_index -1) + endif() + + if (${substring_index} GREATER_EQUAL 0) + list(APPEND ARCH_FLAGS -mcpu=power10) + elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le") + list(APPEND ARCH_FLAGS -mcpu=powerpc64le) + else() + list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) + # TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) + endif() + elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") + message(STATUS "loongarch64 detected") + + list(APPEND ARCH_FLAGS -march=loongarch64) + if (GGML_LASX) + list(APPEND ARCH_FLAGS -mlasx) + endif() + if (GGML_LSX) + list(APPEND ARCH_FLAGS -mlsx) + endif() + elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64") + message(STATUS "RISC-V detected") + if (GGML_RVV) + list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d) endif() else() - if (GGML_NATIVE) - list(APPEND ARCH_FLAGS -march=native) - endif() - if (GGML_F16C) - list(APPEND ARCH_FLAGS -mf16c) - endif() - if (GGML_FMA) - list(APPEND ARCH_FLAGS -mfma) - endif() - if (GGML_AVX) - list(APPEND ARCH_FLAGS -mavx) - endif() - if (GGML_AVX2) - list(APPEND ARCH_FLAGS -mavx2) - endif() - if (GGML_AVX_VNNI) - list(APPEND ARCH_FLAGS -mavxvnni) - endif() - if (GGML_AVX512) - list(APPEND ARCH_FLAGS -mavx512f) - list(APPEND ARCH_FLAGS -mavx512dq) - list(APPEND ARCH_FLAGS -mavx512bw) - endif() - if (GGML_AVX512_VBMI) - list(APPEND ARCH_FLAGS -mavx512vbmi) - endif() - if (GGML_AVX512_VNNI) - list(APPEND ARCH_FLAGS -mavx512vnni) - endif() - if (GGML_AVX512_BF16) - list(APPEND ARCH_FLAGS -mavx512bf16) - endif() - if (GGML_AMX_TILE) - list(APPEND ARCH_FLAGS -mamx-tile) - endif() - if (GGML_AMX_INT8) - list(APPEND ARCH_FLAGS -mamx-int8) - endif() - if (GGML_AMX_BF16) - list(APPEND ARCH_FLAGS -mamx-bf16) - endif() - endif() -elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") - message(STATUS "PowerPC detected") - execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M) - string(FIND "${POWER10_M}" "POWER10" substring_index) - if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "") - set(substring_index -1) + message(STATUS "Unknown architecture") endif() - if (${substring_index} 
GREATER_EQUAL 0) - list(APPEND ARCH_FLAGS -mcpu=power10) - elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le") - list(APPEND ARCH_FLAGS -mcpu=powerpc64le) - else() - list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) - # TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) + if (GGML_CPU_AARCH64) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_AARCH64) endif() -elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") - message(STATUS "loongarch64 detected") - list(APPEND ARCH_FLAGS -march=loongarch64) - if (GGML_LASX) - list(APPEND ARCH_FLAGS -mlasx) + message(STATUS "Adding CPU backend variant ${GGML_CPU_NAME}: ${ARCH_FLAGS} ${ARCH_DEFINITIONS}") + target_sources(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_SOURCES}) + target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS}) + target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS}) + + if (GGML_BACKEND_DL) + # The feature detection code is compiled as a separate target so that + # it can be built without the architecture flags + # Since multiple variants of the CPU backend may be included in the same + # build, using set_source_files_properties() to set the arch flags is not possible + set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats) + add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp) + target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include) + target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS}) + target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED) + set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME}) endif() - if (GGML_LSX) - list(APPEND ARCH_FLAGS -mlsx) + + if (EMSCRIPTEN) + set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128") endif() -elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64") - message(STATUS "RISC-V detected") - if (GGML_RVV) - list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d) - endif() -else() - message(STATUS "Unknown architecture") -endif() - -if (GGML_CPU_AARCH64) - message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels") - target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_AARCH64) -endif() - -target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES}) -set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS "${ARCH_FLAGS}") -set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}") - -# the feature detection code must be compiled without any architecture flags -target_sources(ggml-cpu PRIVATE cpu-feats-x86.cpp) -# target_sources(ggml-cpu PRIVATE cpu-feats-arm.cpp) # TODO: ARM feature detection - -if (EMSCRIPTEN) - set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128") -endif() +endfunction() diff --git a/ggml/src/ggml-cpu/cpu-feats-x86.cpp b/ggml/src/ggml-cpu/cpu-feats-x86.cpp index 514701ffe..e8133d411 100644 --- a/ggml/src/ggml-cpu/cpu-feats-x86.cpp +++ b/ggml/src/ggml-cpu/cpu-feats-x86.cpp @@ -1,4 +1,3 @@ -#include "ggml-cpu.h" #include "ggml-backend-impl.h" #if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) @@ -13,6 +12,7 @@ #include #include +// ref: https://cdrdv2-public.intel.com/782156/325383-sdm-vol-2abcd.pdf struct cpuid_x86 { bool SSE3(void) { return f_1_ecx[0]; } bool PCLMULQDQ(void) { return f_1_ecx[1]; } @@ -50,11 
+50,15 @@ struct cpuid_x86 { bool INVPCID(void) { return f_7_ebx[10]; } bool RTM(void) { return is_intel && f_7_ebx[11]; } bool AVX512F(void) { return f_7_ebx[16]; } + bool AVX512DQ(void) { return f_7_ebx[17]; } bool RDSEED(void) { return f_7_ebx[18]; } bool ADX(void) { return f_7_ebx[19]; } bool AVX512PF(void) { return f_7_ebx[26]; } bool AVX512ER(void) { return f_7_ebx[27]; } bool AVX512CD(void) { return f_7_ebx[28]; } + bool AVX512BW(void) { return f_7_ebx[30]; } + bool AVX512VL(void) { return f_7_ebx[31]; } + bool SHA(void) { return f_7_ebx[29]; } bool PREFETCHWT1(void) { return f_7_ecx[0]; } @@ -259,36 +263,57 @@ void test_x86_is() { static int ggml_backend_cpu_x86_score() { // FIXME: this does not check for OS support - cpuid_x86 is; - // if the CPU backend was built with any features not supported by the current CPU, it cannot be used - if (ggml_cpu_has_fma() && !is.FMA()) { return 0; } - if (ggml_cpu_has_f16c() && !is.F16C()) { return 0; } - if (ggml_cpu_has_ssse3() && !is.SSSE3()) { return 0; } - if (ggml_cpu_has_sse3() && !is.SSE3()) { return 0; } - if (ggml_cpu_has_avx() && !is.AVX()) { return 0; } - if (ggml_cpu_has_avx_vnni() && !is.AVX_VNNI()) { return 0; } - if (ggml_cpu_has_avx2() && !is.AVX2()) { return 0; } - if (ggml_cpu_has_avx512() && !is.AVX512F()) { return 0; } - if (ggml_cpu_has_avx512_vbmi() && !is.AVX512_VBMI()) { return 0; } - if (ggml_cpu_has_avx512_bf16() && !is.AVX512_BF16()) { return 0; } - if (ggml_cpu_has_avx512_vnni() && !is.AVX512_VNNI()) { return 0; } - if (ggml_cpu_has_amx_int8() && !is.AMX_INT8()) { return 0; } - - // calculate a backend score based on the supported features - // more important features have a higher weight int score = 0; - score += ggml_cpu_has_fma () * 1; - score += ggml_cpu_has_f16c () * 1<<1; - score += ggml_cpu_has_ssse3 () * 1<<2; - score += ggml_cpu_has_sse3 () * 1<<3; - score += ggml_cpu_has_avx_vnni () * 1<<4; - score += ggml_cpu_has_avx () * 1<<5; - score += ggml_cpu_has_avx2 () * 1<<6; - score += ggml_cpu_has_avx512 () * 1<<7; - // score += ggml_cpu_has_avx512_vbmi() * 1<<8; // not used - score += ggml_cpu_has_avx512_bf16() * 1<<9; - score += ggml_cpu_has_avx512_vnni() * 1<<10; - score += ggml_cpu_has_amx_int8 () * 1<<11; + cpuid_x86 is; + +#ifdef GGML_FMA + if (!is.FMA()) { return 0; } + score += 1; +#endif +#ifdef GGML_F16C + if (!is.F16C()) { return 0; } + score += 1<<1; +#endif +#ifdef GGML_SSE42 + if (!is.SSE42()) { return 0; } + score += 1<<2; +#endif +#ifdef GGML_AVX + if (!is.AVX()) { return 0; } + score += 1<<4; +#endif +#ifdef GGML_AVX2 + if (!is.AVX2()) { return 0; } + score += 1<<5; +#endif +#ifdef GGML_AVX_VNNI + if (!is.AVX_VNNI()) { return 0; } + score += 1<<6; +#endif +#ifdef GGML_AVX512 + if (!is.AVX512F()) { return 0; } + if (!is.AVX512CD()) { return 0; } + if (!is.AVX512VL()) { return 0; } + if (!is.AVX512DQ()) { return 0; } + if (!is.AVX512BW()) { return 0; } + score += 1<<7; +#endif +#ifdef GGML_AVX512_VBMI + if (!is.AVX512_VBMI()) { return 0; } + score += 1<<8; +#endif +#ifdef GGML_AVX512_BF16 + if (!is.AVX512_BF16()) { return 0; } + score += 1<<9; +#endif +#ifdef GGML_AVX512_VNNI + if (!is.AVX512_VNNI()) { return 0; } + score += 1<<10; +#endif +#ifdef GGML_AMX_INT8 + if (!is.AMX_INT8()) { return 0; } + score += 1<<11; +#endif return score; } diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index e4a9ca013..40ca7bb68 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -756,7 +756,7 @@ do { \ #define GGML_F32Cx8_LOAD(x) 
_mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x))) #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0)) #else -static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) { +static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) { float tmp[8]; for (int i = 0; i < 8; i++) { diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 77e5d87a8..d3b4bdb96 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -641,7 +641,15 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_llamafile()) { features.push_back({ "LLAMAFILE", "1" }); } - // TODO: rename this + #ifdef GGML_USE_ACCELERATE + features.push_back({ "ACCELERATE", "1" }); + #endif + #ifdef GGML_USE_CPU_HBM + features.push_back({ "CPU_HBM", "1" }); + #endif + #ifdef GGML_USE_OPENMP + features.push_back({ "OPENMP", "1" }); + #endif #ifdef GGML_USE_CPU_AARCH64 features.push_back({ "AARCH64_REPACK", "1" }); #endif diff --git a/scripts/build-cpu.sh b/scripts/build-cpu.sh deleted file mode 100755 index 4b2ad816e..000000000 --- a/scripts/build-cpu.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -name="$1" -args="${@:2}" - -echo "Building $name with args: $args" - -rm -fr build-cpu-$1 -cmake -S . -B build-cpu-$1 -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF $args -cmake --build build-cpu-$1 --config Release -t ggml-cpu -j $(nproc) -cp build-cpu-$1/bin/libggml-cpu.so ./libggml-cpu-$1.so -rm -fr build-cpu-$1 From 1da7b765692764a8b33b08da61cbee63812a7bd9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 4 Dec 2024 22:38:20 +0200 Subject: [PATCH 06/99] server : fix speculative decoding with context shift (#10641) * server : fix speculative decoding with context shift ggml-ci * server : take into account speculative limits ggml-ci * server : add tests --- examples/server/server.cpp | 29 +++++++++++++++-- .../server/tests/unit/test_speculative.py | 31 +++++++++++++++++++ 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 9bca3f30e..31dfd6240 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -921,6 +921,8 @@ struct server_context { slot.params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min); slot.params.speculative.n_min = std::min(slot.params.speculative.n_max, slot.params.speculative.n_min); + slot.params.speculative.n_min = std::max(slot.params.speculative.n_min, 2); + slot.params.speculative.n_max = std::max(slot.params.speculative.n_max, 0); if (slot.params.sampling.dry_base < 1.0f) { slot.params.sampling.dry_base = defaults.sampling.dry_base; @@ -2322,10 +2324,29 @@ struct server_context { continue; } + // determine the max draft that fits the current slot state + int n_draft_max = slot.params.speculative.n_max; + + // note: n_past is not yet increased for the `id` token sampled above + // also, need to leave space for 1 extra token to allow context shifts + n_draft_max = std::min(n_draft_max, slot.n_ctx - slot.n_past - 2); + + if (slot.n_remaining > 0) { + n_draft_max = std::min(n_draft_max, slot.n_remaining - 1); + } + + SLT_DBG(slot, "max possible draft: %d\n", n_draft_max); + + if (n_draft_max < slot.params.speculative.n_min) { + SLT_DBG(slot, "the max possible draft is too small: %d < %d - skipping speculative decoding\n", n_draft_max, slot.params.speculative.n_min); + + continue; + } + llama_token id = slot.sampled; struct common_speculative_params params_spec; 
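                    // For example, with the n_ctx = 64 used by the new tests below, a slot at
                    // n_past = 61 with speculative.n_max = 16 gets
                    //   n_draft_max = min(16, 64 - 61 - 2) = 1,
                    // which is below the (now at least 2) speculative.n_min, so that token is
                    // generated without speculation instead of overflowing the context.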
- params_spec.n_draft = slot.params.speculative.n_max; + params_spec.n_draft = n_draft_max; params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max; params_spec.p_min = slot.params.speculative.p_min; @@ -2333,6 +2354,8 @@ struct server_context { // ignore small drafts if (slot.params.speculative.n_min > (int) draft.size()) { + SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min); + continue; } @@ -2344,6 +2367,8 @@ struct server_context { common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true); } + SLT_DBG(slot, "decoding speculative batch, size = %d\n", slot.batch_spec.n_tokens); + llama_decode(ctx, slot.batch_spec); // the accepted tokens from the speculation @@ -2372,7 +2397,7 @@ struct server_context { } } - SRV_DBG("accepted %d/%d draft tokens\n", (int) ids.size() - 1, (int) draft.size()); + SLT_DBG(slot, "accepted %d/%d draft tokens, new n_past = %d\n", (int) ids.size() - 1, (int) draft.size(), slot.n_past); } } diff --git a/examples/server/tests/unit/test_speculative.py b/examples/server/tests/unit/test_speculative.py index 982d6abb4..3bb5733cb 100644 --- a/examples/server/tests/unit/test_speculative.py +++ b/examples/server/tests/unit/test_speculative.py @@ -82,6 +82,37 @@ def test_different_draft_min_draft_max(): last_content = res.body["content"] +def test_slot_ctx_not_exceeded(): + global server + server.n_ctx = 64 + server.start() + res = server.make_request("POST", "/completion", data={ + "prompt": "Hello " * 56, + "temperature": 0.0, + "top_k": 1, + "speculative.p_min": 0.0, + }) + assert res.status_code == 200 + assert len(res.body["content"]) > 0 + + +def test_with_ctx_shift(): + global server + server.n_ctx = 64 + server.start() + res = server.make_request("POST", "/completion", data={ + "prompt": "Hello " * 56, + "temperature": 0.0, + "top_k": 1, + "n_predict": 64, + "speculative.p_min": 0.0, + }) + assert res.status_code == 200 + assert len(res.body["content"]) > 0 + assert res.body["tokens_predicted"] == 64 + assert res.body["truncated"] == True + + @pytest.mark.parametrize("n_slots,n_requests", [ (1, 2), (2, 2), From f112d198cd9953b633ac662c2705d6e423438717 Mon Sep 17 00:00:00 2001 From: aryantandon01 <80969509+aryantandon01@users.noreply.github.com> Date: Thu, 5 Dec 2024 03:49:20 +0530 Subject: [PATCH 07/99] Update deprecation-warning.cpp (#10619) Fixed Path Separator Handling for Cross-Platform Support (Windows File Systems) --- examples/deprecation-warning/deprecation-warning.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/deprecation-warning/deprecation-warning.cpp b/examples/deprecation-warning/deprecation-warning.cpp index 11b35d2c2..c2958ea12 100644 --- a/examples/deprecation-warning/deprecation-warning.cpp +++ b/examples/deprecation-warning/deprecation-warning.cpp @@ -12,7 +12,7 @@ int main(int argc, char** argv) { } // Get only the program name from the full path - auto pos = filename.find_last_of('/'); + auto pos = filename.find_last_of("/\\"); if (pos != std::string::npos) { filename = filename.substr(pos+1); } From d405804be84cfdac936f28b3446ce8cb988408d8 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 5 Dec 2024 08:47:55 +0100 Subject: [PATCH 08/99] py : update outdated copy-paste instructions [no ci] (#10667) This commit updates the copy-paste instruction in convert_hf_to_gguf_update.py to reflect that convert_hf_to_gguf.py will have already been updated with the new get_vocab_base_pre() function when this script 
completes. --- convert_hf_to_gguf_update.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 28cd02e5a..2a51fce2d 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -17,7 +17,7 @@ # # python3 convert_hf_to_gguf_update.py # -# - Copy-paste the generated get_vocab_base_pre() function into convert_hf_to_gguf.py +# - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated # - Update llama.cpp with the new pre-tokenizer if necessary # # TODO: generate tokenizer tests for llama.cpp From c2082d93a82d66dca112098c63775d7a97e6d47b Mon Sep 17 00:00:00 2001 From: PAB Date: Tue, 3 Dec 2024 20:20:04 +0100 Subject: [PATCH 09/99] ggml : add `GGML_PAD_REFLECT_1D` operation (ggml/1034) * ggml_pad_reflect_1d defined in header * implemented on CPU * called the forward pass * impl Metal kernel * added Metal kernel * added OP_PAD_REFLECT_1D in test-backend-ops.cpp * add test-pad-reflect-1d test case * test case support multiple backend --- ggml/include/ggml.h | 8 +++++ ggml/src/ggml-cpu/ggml-cpu.c | 39 +++++++++++++++++++++++ ggml/src/ggml-metal/ggml-metal.m | 35 +++++++++++++++++++++ ggml/src/ggml-metal/ggml-metal.metal | 47 ++++++++++++++++++++++++++++ ggml/src/ggml.c | 37 ++++++++++++++++++++-- tests/test-backend-ops.cpp | 28 +++++++++++++++++ 6 files changed, 192 insertions(+), 2 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 65cb92c44..1c8cc11b6 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -499,6 +499,7 @@ extern "C" { GGML_OP_POOL_2D_BACK, GGML_OP_UPSCALE, // nearest interpolate GGML_OP_PAD, + GGML_OP_PAD_REFLECT_1D, GGML_OP_ARANGE, GGML_OP_TIMESTEP_EMBEDDING, GGML_OP_ARGSORT, @@ -1695,6 +1696,13 @@ extern "C" { int p2, int p3); + // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c] + GGML_API struct ggml_tensor * ggml_pad_reflect_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int p0, + int p1); + // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151 // timesteps: [N,] // return: [N, dim] diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 40ca7bb68..f82e877f0 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -10439,6 +10439,40 @@ static void ggml_compute_forward_pad( } } +// ggml_compute_forward_pad_reflect_1d + +static void ggml_compute_forward_pad_reflect_1d( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int ith = params->ith; + const int nth = params->nth; + + const int32_t * opts = (const int32_t *) dst->op_params; + const int p0 = opts[0]; + const int p1 = opts[1]; + + GGML_TENSOR_UNARY_OP_LOCALS + + for (int64_t i3 = 0; i3 < ne3; i3++) { + for (int64_t i2 = 0; i2 < ne2; i2++) { + for (int64_t i1 = ith; i1 < ne1; i1 += nth) { + float * left = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + p0*nb0); + float * right = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (ne0-p1-1)*nb0); + + ggml_vec_cpy_f32(ne00, left, (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01)); + + for (int i0 = 1; i0 <= p0; i0++) { left[-i0] = left[i0]; } + for (int i0 = 1; i0 <= p1; i0++) { right[i0] = right[-i0]; } + } + } + } +} // ggml_compute_forward_arange @@ -12535,6 +12569,10 @@ static 
void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_pad(params, tensor); } break; + case GGML_OP_PAD_REFLECT_1D: + { + ggml_compute_forward_pad_reflect_1d(params, tensor); + } break; case GGML_OP_ARANGE: { ggml_compute_forward_arange(params, tensor); @@ -12877,6 +12915,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; case GGML_OP_UPSCALE: case GGML_OP_PAD: + case GGML_OP_PAD_REFLECT_1D: case GGML_OP_ARANGE: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 093ae9000..71f5525f3 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -310,6 +310,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32, GGML_METAL_KERNEL_TYPE_UPSCALE_F32, GGML_METAL_KERNEL_TYPE_PAD_F32, + GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, GGML_METAL_KERNEL_TYPE_ARANGE_F32, GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, @@ -877,6 +878,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32, conv_transpose_1d_f16_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); @@ -1099,6 +1101,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_POOL_2D: case GGML_OP_UPSCALE: case GGML_OP_PAD: + case GGML_OP_PAD_REFLECT_1D: case GGML_OP_ARANGE: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: @@ -3258,6 +3261,38 @@ static void ggml_metal_encode_node( const int nth = MIN(1024, ne0); + [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_PAD_REFLECT_1D: + { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + + const int32_t p0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t p1 = ((const int32_t *)(dst->op_params))[1]; + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:6]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:11]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:12]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:13]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:14]; + [encoder setBytes:&p0 length:sizeof(p0) atIndex:15]; + [encoder 
setBytes:&p1 length:sizeof(p1) atIndex:16]; + + const int nth = MIN(1024, ne0); + [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_ARANGE: diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 5caa0846a..ee3e353ab 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -2897,6 +2897,53 @@ kernel void kernel_pad_f32( } } +kernel void kernel_pad_reflect_1d_f32( + device const char * src0, + device char * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant int64_t & ne0, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + constant int32_t & p0, + constant int32_t & p1, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tgpg[[threadgroups_per_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + const int64_t i3 = tgpig.z; + const int64_t i2 = tgpig.y; + const int64_t i1 = tgpig.x; + + const int64_t i03 = i3; + const int64_t i02 = i2; + const int64_t i01 = i1; + + device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01); + device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1); + + if (i1 < ne01 && i2 < ne02 && i3 < ne03) { + for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { + if (i0 < p0) { + dst_ptr[i0] = src0_ptr[p0 - i0]; + } else if (i0 < ne0 - p1) { + dst_ptr[i0] = src0_ptr[i0 - p0]; + } else { + dst_ptr[i0] = src0_ptr[(ne0 - p1 - p0) - (p1 + 1 - (ne0 - i0)) - 1]; + } + } + } +} + kernel void kernel_arange_f32( device char * dst, constant int64_t & ne0, diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 1a9a7efaf..2c338dee5 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -950,6 +950,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "POOL_2D_BACK", "UPSCALE", "PAD", + "PAD_REFLECT_1D", "ARANGE", "TIMESTEP_EMBEDDING", "ARGSORT", @@ -983,7 +984,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "OPT_STEP_ADAMW", }; -static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); +static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1045,6 +1046,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "pool_2d_back(x)", "upscale(x)", "pad(x)", + "pad_reflect_1d(x)", "arange(start, stop, step)", "timestep_embedding(timesteps, dim, max_period)", "argsort(x)", @@ -1078,7 +1080,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "adamw(x)", }; -static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); +static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4097,6 +4099,37 @@ struct ggml_tensor * ggml_pad( return result; } +// ggml_pad_reflect_1d + +struct ggml_tensor * ggml_pad_reflect_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int p0, + int p1) { + GGML_ASSERT(p0 >= 0); + GGML_ASSERT(p1 >= 0); + + GGML_ASSERT(p0 < a->ne[0]); // padding length on each size must be less than the + GGML_ASSERT(p1 < a->ne[0]); // existing length of the dimension being padded + + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(a->type == GGML_TYPE_F32); + + struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, + 
a->ne[0] + p0 + p1, + a->ne[1], + a->ne[2], + a->ne[3]); + + int32_t params[] = { p0, p1 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_PAD_REFLECT_1D; + result->src[0] = a; + + return result; +} + // ggml_arange struct ggml_tensor * ggml_arange( diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 807d271c6..042eeaaab 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -2697,6 +2697,33 @@ struct test_pad : public test_case { } }; +// GGML_OP_PAD_REFLECT_1D +struct test_pad_reflect_1d : public test_case { + const ggml_type type; + const std::array ne_a; + const int pad_0; + const int pad_1; + + std::string vars() override { + return VARS_TO_STR4(type, ne_a, pad_0, pad_1); + } + + test_pad_reflect_1d(ggml_type type = GGML_TYPE_F32, + std::array ne_a = {512, 34, 2, 1}, + int pad_0 = 10, int pad_1 = 9) + : type(type), ne_a(ne_a), pad_0(pad_0), pad_1(pad_1) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 2, ne_a.data()); + ggml_set_name(a, "a"); + + ggml_tensor * out = ggml_pad_reflect_1d(ctx, a, pad_0, pad_1); + ggml_set_name(out, "out"); + + return out; + } +}; + // GGML_OP_ARANGE struct test_arange : public test_case { const ggml_type type; @@ -3816,6 +3843,7 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {9, 9, 1280, 1})); test_cases.emplace_back(new test_acc()); test_cases.emplace_back(new test_pad()); + test_cases.emplace_back(new test_pad_reflect_1d()); test_cases.emplace_back(new test_arange()); test_cases.emplace_back(new test_timestep_embedding()); test_cases.emplace_back(new test_leaky_relu()); From a8cbab201dcf4c76c30b8028427494d71f02bb57 Mon Sep 17 00:00:00 2001 From: PAB Date: Wed, 4 Dec 2024 09:19:30 +0100 Subject: [PATCH 10/99] ggml: add `GGML_SET` Metal kernel + i32 CPU kernel (ggml/1037) * implemented cpu kernel * add i32 test cases in test-backend-ops * typedef `ggml_metal_kargs_set` * implemented `kernel_set` * memcpy --- ggml/src/ggml-cpu/ggml-cpu.c | 80 ++++++++++++++++++++++++++- ggml/src/ggml-metal/ggml-metal-impl.h | 15 +++++ ggml/src/ggml-metal/ggml-metal.m | 76 +++++++++++++++++++++++++ ggml/src/ggml-metal/ggml-metal.metal | 32 +++++++++++ tests/test-backend-ops.cpp | 4 ++ 5 files changed, 206 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index f82e877f0..f12b62e3b 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1374,7 +1374,10 @@ struct ggml_compute_state { inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } -inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + +inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } +inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; } + inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } inline static void ggml_vec_add_f32 (const int n, float * z, const 
float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } @@ -8248,6 +8251,77 @@ static void ggml_compute_forward_set_f32( } } +static void ggml_compute_forward_set_i32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); + + // view src0 and dst with these strides and data offset inbytes during set + // nb0 is implicitly element_size because src0 and dst are contiguous + size_t nb1 = ((int32_t *) dst->op_params)[0]; + size_t nb2 = ((int32_t *) dst->op_params)[1]; + size_t nb3 = ((int32_t *) dst->op_params)[2]; + size_t offset = ((int32_t *) dst->op_params)[3]; + bool inplace = (bool) ((int32_t *) dst->op_params)[4]; + + if (!inplace) { + if (params->ith == 0) { + // memcpy needs to be synchronized across threads to avoid race conditions. + // => do it in INIT phase + memcpy( + ((char *) dst->data), + ((char *) src0->data), + ggml_nbytes(dst)); + } + ggml_barrier(params->threadpool); + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src1); + const int nc = src1->ne[0]; + + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) + + // src0 and dst as viewed during set + const size_t nb0 = ggml_element_size(src0); + + const int im0 = (ne10 == 0 ? 0 : ne10-1); + const int im1 = (ne11 == 0 ? 0 : ne11-1); + const int im2 = (ne12 == 0 ? 0 : ne12-1); + const int im3 = (ne13 == 0 ? 0 : ne13-1); + + GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst)); + + GGML_ASSERT(nb10 == sizeof(int32_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are viewed with shape of src1 and offset + // => same indices + const int i3 = ir/(ne12*ne11); + const int i2 = (ir - i3*ne12*ne11)/ne11; + const int i1 = (ir - i3*ne12*ne11 - i2*ne11); + + ggml_vec_cpy_i32(nc, + (int32_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset), + (int32_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11)); + } +} + static void ggml_compute_forward_set( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -8259,6 +8333,10 @@ static void ggml_compute_forward_set( { ggml_compute_forward_set_f32(params, dst); } break; + case GGML_TYPE_I32: + { + ggml_compute_forward_set_i32(params, dst); + } break; case GGML_TYPE_F16: case GGML_TYPE_BF16: case GGML_TYPE_Q4_0: diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index e19586719..e3dc25f16 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -102,6 +102,21 @@ typedef struct { uint64_t nb3; } ggml_metal_kargs_cpy; +typedef struct { + int64_t ne10; + int64_t ne11; + int64_t ne12; + uint64_t nb10; + uint64_t nb11; + uint64_t nb12; + uint64_t nb13; + uint64_t nb1; + uint64_t nb2; + uint64_t nb3; + uint64_t offs; + bool inplace; +} ggml_metal_kargs_set; + typedef struct { int32_t ne00; int32_t ne01; diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 71f5525f3..f80fda7a4 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -372,6 +372,8 @@ enum ggml_metal_kernel_type { 
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256, + GGML_METAL_KERNEL_TYPE_SET_I32, + GGML_METAL_KERNEL_TYPE_SET_F32, GGML_METAL_KERNEL_TYPE_CPY_F32_F32, GGML_METAL_KERNEL_TYPE_CPY_F32_F16, GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, @@ -940,6 +942,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H256, flash_attn_ext_vec_q5_0_h256, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H256, flash_attn_ext_vec_q5_1_h256, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H256, flash_attn_ext_vec_q8_0_h256, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_F32, set_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_I32, set_i32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_BF16, cpy_f32_bf16, use_bfloat); @@ -1159,6 +1163,16 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex return false; }; } + case GGML_OP_SET: + { + switch (op->src[0]->type) { + case GGML_TYPE_F32: + case GGML_TYPE_I32: + return true; + default: + return false; + }; + } case GGML_OP_DIAG_MASK_INF: case GGML_OP_GET_ROWS: { @@ -3824,6 +3838,68 @@ static void ggml_metal_encode_node( [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; + case GGML_OP_SET: + { + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); + + // src0 and dst as viewed during set + const size_t dst_nb0 = ggml_element_size(src0); + + const size_t dst_nb1 = ((int32_t *) dst->op_params)[0]; + const size_t dst_nb2 = ((int32_t *) dst->op_params)[1]; + const size_t dst_nb3 = ((int32_t *) dst->op_params)[2]; + const size_t offset = ((int32_t *) dst->op_params)[3]; + const bool inplace = (bool) ((int32_t *) dst->op_params)[4]; + + if (!inplace) { + memcpy(((char *) dst->data), ((char *) src0->data), ggml_nbytes(dst)); + } + + const int im0 = (ne10 == 0 ? 0 : ne10-1); + const int im1 = (ne11 == 0 ? 0 : ne11-1); + const int im2 = (ne12 == 0 ? 0 : ne12-1); + const int im3 = (ne13 == 0 ? 
0 : ne13-1); + + GGML_ASSERT(offset + im0*dst_nb0 + im1*dst_nb1 + im2*dst_nb2 + im3*dst_nb3 <= ggml_nbytes(dst)); + + id pipeline = nil; + + switch (src0t) { + case GGML_TYPE_F32: + GGML_ASSERT(nb10 == sizeof(float)); + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_F32].pipeline; break; + case GGML_TYPE_I32: + GGML_ASSERT(nb10 == sizeof(int32_t)); + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_I32].pipeline; break; + default: GGML_ABORT("fatal error"); + } + + ggml_metal_kargs_set args = { + /*.ne10 =*/ ne10, + /*.ne11 =*/ ne11, + /*.ne12 =*/ ne12, + /*.nb10 =*/ nb10, + /*.nb11 =*/ nb11, + /*.nb12 =*/ nb12, + /*.nb13 =*/ nb13, + /*.nb1 =*/ dst_nb1, + /*.nb2 =*/ dst_nb2, + /*.nb3 =*/ dst_nb3, + /*.offs =*/ offset, + /*.inplace =*/ inplace, + }; + + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne10); + + [encoder setComputePipelineState:pipeline]; + [encoder setBytes:&args length:sizeof(args) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; + + [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; case GGML_OP_POOL_2D: { GGML_ASSERT(ggml_is_contiguous(src0)); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index ee3e353ab..8ba43904d 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -3927,6 +3927,38 @@ template [[host_name("kernel_flash_attn_ext_vec_q8_0_h256")]] kernel flash_attn_ #undef FA_TYPES +template +kernel void kernel_set( + constant ggml_metal_kargs_set & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const int i13 = tgpig[2]; + const int i12 = tgpig[1]; + const int i11 = tgpig[0]; + + const int64_t n = i13*args.ne12*args.ne11*args.ne10 + i12*args.ne11*args.ne10 + i11*args.ne10; + + const int64_t i3 = n / (args.ne12*args.ne11*args.ne10); + const int64_t i2 = (n - i3*args.ne12*args.ne11*args.ne10) / (args.ne11*args.ne10); + const int64_t i1 = (n - i3*args.ne12*args.ne11*args.ne10 - i2*args.ne11*args.ne10) / args.ne10; + + device T * dst_data = (device T *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + args.offs); + + for (int64_t i10 = tpitg.x; i10 < args.ne10; i10 += ntg.x) { + device const T * src = (device T *) (src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + i10*args.nb10); + dst_data[i10] = (T) src[0]; + } +} + +typedef decltype(kernel_set) kernel_set_t; + +template [[host_name("kernel_set_f32")]] kernel kernel_set_t kernel_set; +template [[host_name("kernel_set_i32")]] kernel kernel_set_t kernel_set; + template kernel void kernel_cpy( constant ggml_metal_kargs_cpy & args, diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 042eeaaab..9dd41260a 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3521,6 +3521,10 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_set(GGML_TYPE_F32, GGML_TYPE_F32, {6, 5, 4, 3}, dim)); } + for (int dim = 1; dim < GGML_MAX_DIMS; ++dim) { + test_cases.emplace_back(new test_set(GGML_TYPE_I32, GGML_TYPE_I32, {6, 5, 4, 3}, dim)); + } + for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) { for (ggml_type type_dst : all_types) { test_cases.emplace_back(new test_cpy(type_src, 
type_dst, {256, 4, 4, 4})); From 0cd182ebcc2c45907240e727e80e8fbf0f5fdee9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 5 Dec 2024 13:27:42 +0200 Subject: [PATCH 11/99] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 27769c93b..47eae44f7 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -b903ffe79daf18c0aaacbebe44a7b93a6b8d0982 +74d66b63eaf207a24f3e93bb922aba131cbf2906 From 6fe624783166e7355cec915de0094e63cd3558eb Mon Sep 17 00:00:00 2001 From: Riccardo Orlando Date: Thu, 5 Dec 2024 19:30:59 +0100 Subject: [PATCH 12/99] llama : add Minerva 7B model support (#10673) * Support for Minerva 7B * Update convert_hf_to_gguf_update.py --- convert_hf_to_gguf.py | 3 +++ convert_hf_to_gguf_update.py | 1 + include/llama.h | 1 + src/llama-vocab.cpp | 1 + src/llama.cpp | 3 +++ 5 files changed, 9 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index d8df5cc00..9f1419e29 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -658,6 +658,9 @@ class Model: if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450": # ref: https://huggingface.co/facebook/chameleon-7b res = "chameleon" + if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35": + # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0 + res = "minerva-7b" if res is None: logger.warning("\n") diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 2a51fce2d..ce3c571df 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -102,6 +102,7 @@ models = [ {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", }, {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", }, {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", }, + {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", }, ] diff --git a/include/llama.h b/include/llama.h index 168c3fa1f..d121354c1 100644 --- a/include/llama.h +++ b/include/llama.h @@ -104,6 +104,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24, LLAMA_VOCAB_PRE_TYPE_EXAONE = 25, LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26, + LLAMA_VOCAB_PRE_TYPE_MINERVA = 27, }; enum llama_rope_type { diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index d1dc96276..8c9aaf5a0 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -418,6 +418,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { case LLAMA_VOCAB_PRE_TYPE_SMOLLM: case LLAMA_VOCAB_PRE_TYPE_CODESHELL: case LLAMA_VOCAB_PRE_TYPE_EXAONE: + case LLAMA_VOCAB_PRE_TYPE_MINERVA: regex_exprs = { "\\p{N}", "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", diff --git a/src/llama.cpp b/src/llama.cpp index 00f78639e..ba4a9dfcf 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -6479,6 +6479,9 @@ static void llm_load_vocab( vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON; vocab.tokenizer_add_bos = true; vocab.tokenizer_clean_spaces = false; + } else if ( + tokenizer_pre == "minerva-7b") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } From c9c6e01daedac542b174c235872569fce5385982 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Thu, 5 Dec 2024 13:15:05 -0600 
Subject: [PATCH 13/99] vulkan: Add VK_NV_cooperative_matrix2 support for mul_mat and flash attention (#10206) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 741 ++++++++++++++++-- .../ggml-vulkan/vulkan-shaders/CMakeLists.txt | 2 + .../vulkan-shaders/dequant_funcs_cm2.comp | 305 +++++++ .../vulkan-shaders/flash_attn_cm2.comp | 289 +++++++ .../vulkan-shaders/mul_mm_cm2.comp | 328 ++++++++ .../vulkan-shaders/vulkan-shaders-gen.cpp | 97 ++- 6 files changed, 1665 insertions(+), 97 deletions(-) create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 07b45d6b9..c7ac0e8f7 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -167,6 +167,7 @@ struct vk_device_struct { uint32_t subgroup_size; uint32_t shader_core_count; bool uma; + bool coopmat2; size_t idx; @@ -176,6 +177,7 @@ struct vk_device_struct { vk_matmul_pipeline2 pipeline_matmul_f16_f32; vk_pipeline pipeline_matmul_split_k_reduce; + vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_COUNT]; vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT]; vk_matmul_pipeline pipeline_matmul_id_f32; @@ -229,6 +231,14 @@ struct vk_device_struct { vk_pipeline pipeline_timestep_embedding_f32; vk_pipeline pipeline_pool2d_f32; + // [2][2][2] is for {f16acc,f32acc}x{large,small_rows}x{unaligned, aligned} + vk_pipeline pipeline_flash_attn_f32_f16_D64[GGML_TYPE_COUNT][2][2][2]; + vk_pipeline pipeline_flash_attn_f32_f16_D80[GGML_TYPE_COUNT][2][2][2]; + vk_pipeline pipeline_flash_attn_f32_f16_D96[GGML_TYPE_COUNT][2][2][2]; + vk_pipeline pipeline_flash_attn_f32_f16_D112[GGML_TYPE_COUNT][2][2][2]; + vk_pipeline pipeline_flash_attn_f32_f16_D128[GGML_TYPE_COUNT][2][2][2]; + vk_pipeline pipeline_flash_attn_f32_f16_D256[GGML_TYPE_COUNT][2][2][2]; + std::unordered_map pipelines; std::unordered_map pipeline_descriptor_set_requirements; @@ -340,6 +350,40 @@ struct vk_mat_vec_id_push_constants { uint32_t nei0; uint32_t ne11; }; +struct vk_flash_attn_push_constants { + uint32_t N; + uint32_t KV; + + uint32_t ne1; + uint32_t ne2; + uint32_t ne3; + + uint32_t neq2; + uint32_t neq3; + uint32_t nek2; + uint32_t nek3; + uint32_t nev2; + uint32_t nev3; + uint32_t nem1; + + uint32_t nb02; + uint32_t nb03; + uint32_t nb12; + uint32_t nb13; + uint32_t nb22; + uint32_t nb23; + uint32_t nb31; + + float scale; + float max_bias; + float logit_softcap; + + uint32_t mask; + uint32_t n_head_log2; + float m0; + float m1; +}; + struct vk_op_push_constants { uint32_t KX; uint32_t KY; @@ -1265,6 +1309,23 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector&& events ); } +// number of rows/cols for flash attention shader +static constexpr uint32_t flash_attention_num_small_rows = 32; +static std::array fa_rows_cols(uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) { + GGML_UNUSED(clamp); + + // small rows, large cols + if (small_rows) { + return {flash_attention_num_small_rows, 128}; + } + // small cols to reduce register count + if (ggml_is_quantized(type) || D == 256) { + return {64, 32}; + } + return {64, 64}; +}; + + static void ggml_vk_load_shaders(vk_device& device) { VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")"); @@ -1275,59 +1336,98 @@ static void ggml_vk_load_shaders(vk_device& device) { // mulmat std::vector l_warptile, 
m_warptile, s_warptile, - l_warptile_mmq, m_warptile_mmq, s_warptile_mmq; + l_warptile_mmq, m_warptile_mmq, s_warptile_mmq, + l_warptile_mmq_k, m_warptile_mmq_k, s_warptile_mmq_k, + l_warptile_mmqid, m_warptile_mmqid, s_warptile_mmqid; std::array l_wg_denoms, m_wg_denoms, s_wg_denoms, - l_mmq_wg_denoms, m_mmq_wg_denoms, s_mmq_wg_denoms; + l_mmq_wg_denoms, m_mmq_wg_denoms, s_mmq_wg_denoms, + l_mmq_wg_denoms_k, m_mmq_wg_denoms_k, s_mmq_wg_denoms_k, + l_mmqid_wg_denoms, m_mmqid_wg_denoms, s_mmqid_wg_denoms; + uint32_t l_align, m_align, s_align; + if (device->coopmat2) { + // spec constants and tile sizes for non-quant matmul/matmul_id + l_warptile = { 256, 128, 256, 64 }; + m_warptile = { 256, 128, 128, 64 }; + s_warptile = { 128, 32, 16, 64 }; + l_wg_denoms = {128, 256, 1 }; + m_wg_denoms = {128, 128, 1 }; + s_wg_denoms = { 32, 16, 1 }; - l_warptile = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; - m_warptile = { 128, 64, 64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size }; - s_warptile = { subgroup_size_16, 32, 32, 16, 32, 32, 2, 2, 2, device->subgroup_size }; + // spec constants and tile sizes for quant matmul (non-Qi_K) + l_warptile_mmq = { 256, 128, 256, 64 }; + m_warptile_mmq = { 256, 128, 128, 64 }; + s_warptile_mmq = { 256, 128, 128, 64 }; + l_mmq_wg_denoms = { 128, 256, 1 }; + m_mmq_wg_denoms = { 128, 128, 1 }; + s_mmq_wg_denoms = { 128, 128, 1 }; - l_warptile_mmq = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; - m_warptile_mmq = { 128, 64, 64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size }; - s_warptile_mmq = { subgroup_size_16, 32, 32, 32, 32, 32, 2, 2, 2, device->subgroup_size }; + // spec constants and tile sizes for quant matmul (Qi_K) + l_warptile_mmq_k = { 256, 128, 512, 16 }; + m_warptile_mmq_k = { 256, 128, 256, 16 }; + s_warptile_mmq_k = { 256, 32, 128, 64 }; + l_mmq_wg_denoms_k = { 128, 512, 1 }; + m_mmq_wg_denoms_k = { 128, 256, 1 }; + s_mmq_wg_denoms_k = { 32, 128, 1 }; - l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 }; - m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 }; - s_mmq_wg_denoms = s_wg_denoms = { 32, 32, 1 }; + // spec constants and tile sizes for quant matmul_id + l_warptile_mmqid = { 256, 128, 128, 16 }; + m_warptile_mmqid = { 256, 128, 64, 16 }; + s_warptile_mmqid = { 256, 64, 64, 16 }; + l_mmqid_wg_denoms = { 128, 128, 1 }; + m_mmqid_wg_denoms = { 128, 64, 1 }; + s_mmqid_wg_denoms = { 64, 64, 1 }; - l_align = 128; - m_align = 64; - s_align = 32; + l_align = 128; + m_align = 64; + s_align = 32; + } else { + l_warptile = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; + m_warptile = { 128, 64, 64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size }; + s_warptile = { subgroup_size_16, 32, 32, 16, 32, 32, 2, 2, 2, device->subgroup_size }; + l_warptile_mmq = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; + m_warptile_mmq = { 128, 64, 64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size }; + s_warptile_mmq = { subgroup_size_16, 32, 32, 32, 32, 32, 2, 2, 2, device->subgroup_size }; + l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 }; + m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 }; + s_mmq_wg_denoms = s_wg_denoms = { 32, 32, 1 }; + l_align = 128; + m_align = 64; + s_align = 32; - // Fallback to smaller sizes if there's not enough shared memory. Given the current shaders - // and tile sizes, this should handle 16KB, 32KB, and 48KB+. 
- // This logic doesn't explicitly account for the 12KB row_ids in the mul_mat_mat_id shaders. - // But the numbers happen to work out for 32KB shared memory size that when using the medium - // size there's enough room for everything, and we assert for this. - uint32_t shmem_needed = (l_warptile[1] + l_warptile[2]) * (l_warptile[3] + 1) * sizeof(float); - if (shmem_needed > device->properties.limits.maxComputeSharedMemorySize) { - l_warptile = m_warptile; - l_wg_denoms = m_wg_denoms; - shmem_needed = (l_warptile[1] + l_warptile[2]) * (l_warptile[3] + 1) * sizeof(float); - GGML_ASSERT(shmem_needed <= device->properties.limits.maxComputeSharedMemorySize); - } - if (device->properties.limits.maxComputeSharedMemorySize >= 32768) { - // assert mul_mat_mat_id shaders will fit. - GGML_ASSERT(shmem_needed + 3072*4 <= device->properties.limits.maxComputeSharedMemorySize); - } - - shmem_needed = (l_warptile_mmq[1] + l_warptile_mmq[2]) * (l_warptile_mmq[3] + 1) * sizeof(float); - if (shmem_needed > device->properties.limits.maxComputeSharedMemorySize) { - if (device->properties.limits.maxComputeSharedMemorySize == 32768) { - l_warptile_mmq = m_warptile_mmq; - l_mmq_wg_denoms = m_mmq_wg_denoms; - } else { - l_warptile_mmq = s_warptile_mmq; - l_mmq_wg_denoms = s_mmq_wg_denoms; + // Fallback to smaller sizes if there's not enough shared memory. Given the current shaders + // and tile sizes, this should handle 16KB, 32KB, and 48KB+. + // This logic doesn't explicitly account for the 12KB row_ids in the mul_mat_mat_id shaders. + // But the numbers happen to work out for 32KB shared memory size that when using the medium + // size there's enough room for everything, and we assert for this. + uint32_t shmem_needed = (l_warptile[1] + l_warptile[2]) * (l_warptile[3] + 1) * sizeof(float); + if (shmem_needed > device->properties.limits.maxComputeSharedMemorySize) { + l_warptile = m_warptile; + l_wg_denoms = m_wg_denoms; + shmem_needed = (l_warptile[1] + l_warptile[2]) * (l_warptile[3] + 1) * sizeof(float); + GGML_ASSERT(shmem_needed <= device->properties.limits.maxComputeSharedMemorySize); } + if (device->properties.limits.maxComputeSharedMemorySize >= 32768) { + // assert mul_mat_mat_id shaders will fit. + GGML_ASSERT(shmem_needed + 3072*4 <= device->properties.limits.maxComputeSharedMemorySize); + } + shmem_needed = (l_warptile_mmq[1] + l_warptile_mmq[2]) * (l_warptile_mmq[3] + 1) * sizeof(float); - GGML_ASSERT(shmem_needed <= device->properties.limits.maxComputeSharedMemorySize); - } - if (device->properties.limits.maxComputeSharedMemorySize >= 32768) { - // assert mul_mat_mat_id shaders will fit. - GGML_ASSERT(shmem_needed + 3072*4 <= device->properties.limits.maxComputeSharedMemorySize); + if (shmem_needed > device->properties.limits.maxComputeSharedMemorySize) { + if (device->properties.limits.maxComputeSharedMemorySize == 32768) { + l_warptile_mmq = m_warptile_mmq; + l_mmq_wg_denoms = m_mmq_wg_denoms; + } else { + l_warptile_mmq = s_warptile_mmq; + l_mmq_wg_denoms = s_mmq_wg_denoms; + } + shmem_needed = (l_warptile_mmq[1] + l_warptile_mmq[2]) * (l_warptile_mmq[3] + 1) * sizeof(float); + GGML_ASSERT(shmem_needed <= device->properties.limits.maxComputeSharedMemorySize); + } + if (device->properties.limits.maxComputeSharedMemorySize >= 32768) { + // assert mul_mat_mat_id shaders will fit. 
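            // (3072 row_ids * 4 bytes = the 12KB of extra shared memory mentioned above)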
+ GGML_ASSERT(shmem_needed + 3072*4 <= device->properties.limits.maxComputeSharedMemorySize); + } } device->pipeline_matmul_f32 = std::make_shared(); @@ -1362,6 +1462,105 @@ static void ggml_vk_load_shaders(vk_device& device) { compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness)); }; +#if defined(VK_NV_cooperative_matrix2) + if (device->coopmat2) { + + auto const &fa_wg_denoms = [&](uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::array { + return {fa_rows_cols(D, clamp, type, small_rows)[0], 1, 1}; + }; + + auto const &fa_spec_constants = [&](uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::vector { + // For large number of rows, 128 invocations seems to work best. + // For small number of rows (e.g. N==1), 256 works better. But matrix granularity for 256 is 32, so we + // can't use 256 for D==80. + uint32_t wg_size = (small_rows && (D % 32) == 0) ? 256 : 128; + auto rows_cols = fa_rows_cols(D, clamp, type, small_rows); + return {wg_size, rows_cols[0], rows_cols[1], (D), clamp}; + }; + +#define CREATE_FA2(TYPE, NAMELC, D) \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][0][0][0], "flash_attn_f32_f16_D" #D "_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_len, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,1,TYPE,false), fa_spec_constants(D,1,TYPE,false), 1); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][0][0][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_len, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,0,TYPE,false), fa_spec_constants(D,0,TYPE,false), fa_rows_cols(D,0,TYPE,false)[1]); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][1][0][0], "flash_attn_f32_f16_D" #D "_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _cm2_len, flash_attn_f32_f16_ ## NAMELC ## _cm2_data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,1,TYPE,false), fa_spec_constants(D,1,TYPE,false), 1); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][1][0][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _cm2_len, flash_attn_f32_f16_ ## NAMELC ## _cm2_data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,0,TYPE,false), fa_spec_constants(D,0,TYPE,false), fa_rows_cols(D,0,TYPE,false)[1]); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][0][1][0], "flash_attn_f32_f16_D" #D "_f16acc_smallrows" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_len, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,1,TYPE,true), fa_spec_constants(D,1,TYPE,true), 1); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][0][1][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc_smallrows" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_len, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,0,TYPE,true), fa_spec_constants(D,0,TYPE,true), fa_rows_cols(D,0,TYPE,true)[1]); \ + ggml_vk_create_pipeline(device, 
device->pipeline_flash_attn_f32_f16_D ## D[TYPE][1][1][0], "flash_attn_f32_f16_D" #D "_f32acc_smallrows" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _cm2_len, flash_attn_f32_f16_ ## NAMELC ## _cm2_data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,1,TYPE,true), fa_spec_constants(D,1,TYPE,true), 1); \ + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][1][1][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc_smallrows" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _cm2_len, flash_attn_f32_f16_ ## NAMELC ## _cm2_data, "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,0,TYPE,true), fa_spec_constants(D,0,TYPE,true), fa_rows_cols(D,0,TYPE,true)[1]); \ + +#define CREATE_FA(TYPE, NAMELC) \ + CREATE_FA2(TYPE, NAMELC, 64) \ + CREATE_FA2(TYPE, NAMELC, 80) \ + CREATE_FA2(TYPE, NAMELC, 96) \ + CREATE_FA2(TYPE, NAMELC, 112) \ + CREATE_FA2(TYPE, NAMELC, 128) \ + CREATE_FA2(TYPE, NAMELC, 256) + + CREATE_FA(GGML_TYPE_F16, f16) + CREATE_FA(GGML_TYPE_Q4_0, q4_0) + CREATE_FA(GGML_TYPE_Q4_1, q4_1) + CREATE_FA(GGML_TYPE_Q5_0, q5_0) + CREATE_FA(GGML_TYPE_Q5_1, q5_1) + CREATE_FA(GGML_TYPE_Q8_0, q8_0) + // K dequants currently disabled because D dimension is rounded up to 256 and runs inefficiently + //CREATE_FA(GGML_TYPE_Q2_K, q2_k) + //CREATE_FA(GGML_TYPE_Q3_K, q3_k) + //CREATE_FA(GGML_TYPE_Q4_K, q4_k) + //CREATE_FA(GGML_TYPE_Q5_K, q5_k) + //CREATE_FA(GGML_TYPE_Q6_K, q6_k) + CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl) +#undef CREATE_FA + + // Create 6 variants, {s,m,l}x{unaligned,aligned} +#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \ + + // Create 2 variants, {f16,f32} accumulator +#define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ + CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ + CREATE_MM(PIPELINE_NAME . 
f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ + + CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3) + + CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3) + CREATE_MM2(pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3) + CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) + + CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_matmul_id_f16, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 4) + + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, , mmqid_wg_denoms, warptile_mmqid, 
vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) +#undef CREATE_MM +#undef CREATE_MM2 + } else +#endif if (device->fp16) { // Create 6 variants, {s,m,l}x{unaligned,aligned} #define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ @@ -1648,15 +1847,28 @@ static vk_device ggml_vk_get_device(size_t idx) { device->physical_device = physical_devices[dev_num]; const std::vector ext_props = device->physical_device.enumerateDeviceExtensionProperties(); + bool fp16_storage = false; + bool fp16_compute = false; bool maintenance4_support = false; bool sm_builtins = false; + bool pipeline_robustness = false; + bool coopmat2_support = false; // Check if maintenance4 is supported for (const auto& properties : ext_props) { if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) { maintenance4_support = true; + } else if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { + fp16_storage = true; + } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { + fp16_compute = true; } else if (strcmp("VK_NV_shader_sm_builtins", properties.extensionName) == 0) { sm_builtins = true; + } else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) { + pipeline_robustness = true; + } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 && + !getenv("GGML_VULKAN_DISABLE_COOPMAT2")) { + coopmat2_support = true; } } @@ -1679,6 +1891,14 @@ static vk_device ggml_vk_get_device(size_t idx) { last_struct = (VkBaseOutStructure *)&sm_props; } +#if defined(VK_NV_cooperative_matrix2) + vk::PhysicalDeviceCooperativeMatrix2PropertiesNV coopmat2_props; + if (coopmat2_support) { + last_struct->pNext = (VkBaseOutStructure *)&coopmat2_props; + last_struct = (VkBaseOutStructure *)&coopmat2_props; + } +#endif + device->physical_device.getProperties2(&props2); device->properties = props2.properties; @@ -1701,20 +1921,6 @@ static vk_device ggml_vk_get_device(size_t idx) { device->shader_core_count = 0; } - bool fp16_storage = false; - bool fp16_compute = false; - bool pipeline_robustness = false; - - for (const auto& properties : ext_props) { - if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { - fp16_storage = true; - } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { - fp16_compute = true; - } else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) { - pipeline_robustness = true; - } - } - const char* GGML_VK_DISABLE_F16 = getenv("GGML_VK_DISABLE_F16"); const bool force_disable_f16 = GGML_VK_DISABLE_F16 != nullptr; @@ -1757,22 +1963,112 @@ static vk_device ggml_vk_get_device(size_t idx) { vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES; vk11_features.pNext = &vk12_features; + last_struct = (VkBaseOutStructure *)&vk12_features; + VkPhysicalDevicePipelineRobustnessFeaturesEXT pl_robustness_features; pl_robustness_features.pNext = nullptr; 
pl_robustness_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_ROBUSTNESS_FEATURES_EXT; pl_robustness_features.pipelineRobustness = VK_FALSE; if (pipeline_robustness) { - vk12_features.pNext = &pl_robustness_features; + last_struct->pNext = (VkBaseOutStructure *)&pl_robustness_features; + last_struct = (VkBaseOutStructure *)&pl_robustness_features; device_extensions.push_back("VK_EXT_pipeline_robustness"); } +#if defined(VK_NV_cooperative_matrix2) + VkPhysicalDeviceCooperativeMatrix2FeaturesNV coopmat2_features {}; + coopmat2_features.pNext = nullptr; + coopmat2_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_2_FEATURES_NV; + if (coopmat2_support) { + last_struct->pNext = (VkBaseOutStructure *)&coopmat2_features; + last_struct = (VkBaseOutStructure *)&coopmat2_features; + device_extensions.push_back("VK_NV_cooperative_matrix2"); + } +#endif + vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2); device->fp16 = device->fp16 && vk12_features.shaderFloat16; device->pipeline_robustness = pl_robustness_features.pipelineRobustness; + if (coopmat2_support) { +#if defined(VK_NV_cooperative_matrix2) + if (coopmat2_features.cooperativeMatrixWorkgroupScope && + coopmat2_features.cooperativeMatrixFlexibleDimensions && + coopmat2_features.cooperativeMatrixReductions && + coopmat2_features.cooperativeMatrixConversions && + coopmat2_features.cooperativeMatrixPerElementOperations && + coopmat2_features.cooperativeMatrixTensorAddressing && + coopmat2_features.cooperativeMatrixBlockLoads && + vk12_features.bufferDeviceAddress) { + + std::vector flexible_dimensions; + uint32_t count = 0; + + PFN_vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV + _vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV = + (PFN_vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV) + vk_instance.instance.getProcAddr("vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV"); + + _vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV(device->physical_device, &count, nullptr); + + VkCooperativeMatrixFlexibleDimensionsPropertiesNV empty_prop {}; + empty_prop.sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_FLEXIBLE_DIMENSIONS_PROPERTIES_NV; + flexible_dimensions.resize(count, empty_prop); + + _vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV(device->physical_device, &count, flexible_dimensions.data()); + + bool found_fp16_128 = false, + found_fp16_256 = false, + found_fp32_128 = false, + found_fp32_256 = false; + // need to support fp16*fp16 with fp16/fp32 accumulator, for workgroupsize 128 + // with 32x16x16 and 256 with 32x32x16. 
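// A sketch of the acceptance test the loop below applies to each
// flexible-dimensions entry (using the property fields exposed by
// VK_NV_cooperative_matrix2), shown for the 128-invocation case:
//
//   bool ok_128 = prop.scope == VK_SCOPE_WORKGROUP_KHR &&
//                 prop.saturatingAccumulation == VK_FALSE &&
//                 prop.AType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
//                 prop.BType == VK_COMPONENT_TYPE_FLOAT16_KHR &&
//                 prop.workgroupInvocations == 128 &&
//                 prop.MGranularity <= 32 &&
//                 prop.NGranularity <= 16 &&
//                 prop.KGranularity <= 16;
//
// The same test is repeated for 256 invocations with 32x32x16 granularity,
// and for both f16 and f32 C/Result types; coopmat2 is only enabled once all
// four combinations are found and the flexible-dimensions maximum is >= 512.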
+ for (auto &prop : flexible_dimensions) { + if (prop.saturatingAccumulation == VK_FALSE && + prop.scope == VK_SCOPE_WORKGROUP_KHR && + prop.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && + prop.BType == VK_COMPONENT_TYPE_FLOAT16_KHR) { + + if (prop.workgroupInvocations == 128 && + prop.MGranularity <= 32 && + prop.NGranularity <= 16 && + prop.KGranularity <= 16) { + if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR && + prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) { + found_fp16_128 = true; + } + if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && + prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) { + found_fp32_128 = true; + } + } + if (prop.workgroupInvocations == 256 && + prop.MGranularity <= 32 && + prop.NGranularity <= 32 && + prop.KGranularity <= 16) { + if (prop.CType == VK_COMPONENT_TYPE_FLOAT16_KHR && + prop.ResultType == VK_COMPONENT_TYPE_FLOAT16_KHR) { + found_fp16_256 = true; + } + if (prop.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && + prop.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR) { + found_fp32_256 = true; + } + } + } + } + if (found_fp16_128 && found_fp16_256 && + found_fp32_128 && found_fp32_256 && + coopmat2_props.cooperativeMatrixFlexibleDimensionsMaxDimension >= 512) { + device->coopmat2 = true; + } + } +#endif + } + if (!vk11_features.storageBuffer16BitAccess) { std::cerr << "ggml_vulkan: device " << GGML_VK_NAME << idx << " does not support 16-bit storage." << std::endl; throw std::runtime_error("Unsupported device"); @@ -2124,7 +2420,7 @@ static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type return ctx->device->pipeline_dequant[type]; } -static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) { +static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type, ggml_prec prec) { VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_pipeline(" << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")"); if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { return ctx->device->pipeline_matmul_f32; @@ -2132,14 +2428,23 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) { return ctx->device->pipeline_matmul_f32_f16; } - if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { - return ctx->device->pipeline_matmul_f16_f32.f32acc; - } - if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { - return ctx->device->pipeline_matmul_f16.f32acc; + if (prec == GGML_PREC_DEFAULT && ctx->device->coopmat2) { + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { + return ctx->device->pipeline_matmul_f16_f32.f16acc; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { + return ctx->device->pipeline_matmul_f16.f16acc; + } + } else { + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { + return ctx->device->pipeline_matmul_f16_f32.f32acc; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { + return ctx->device->pipeline_matmul_f16.f32acc; + } } - if (src1_type != GGML_TYPE_F32) { + if (src1_type != GGML_TYPE_F32 && !ctx->device->coopmat2) { return nullptr; } @@ -2160,6 +2465,10 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte return nullptr; } + if (ctx->device->coopmat2) { + assert(src1_type == GGML_TYPE_F16); + return ctx->device->pipeline_dequant_mul_mat_mat_f16[src0_type].f16acc; + } return 
ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc; } @@ -2844,6 +3153,16 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, break; } + if (ctx->device->coopmat2) { + if ((m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) { + return aligned ? mmp->a_l : mmp->l; + } + if ((m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) { + return aligned ? mmp->a_m : mmp->m; + } + return aligned ? mmp->a_s : mmp->s; + } + if (m <= 32 || n <= 32) { return aligned ? mmp->a_s : mmp->s; } @@ -3008,18 +3327,20 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub } const bool x_non_contig = !ggml_vk_dim01_contiguous(src0); - const bool y_non_contig = !ggml_vk_dim01_contiguous(src1); + // Reformat and convert to fp16 if src1 is non-contiguous, or for coopmat2 for better perf + const bool y_non_contig = (ctx->device->coopmat2 && src1->type == GGML_TYPE_F32) || + !ggml_vk_dim01_contiguous(src1); const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig; - vk_matmul_pipeline mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type); + vk_matmul_pipeline mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type, (ggml_prec)dst->op_params[0]); const bool qx_needs_dequant = mmp == nullptr || x_non_contig; const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig; if (qx_needs_dequant) { // Fall back to dequant + f16 mulmat - mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, GGML_TYPE_F16, y_f32_kernel ? GGML_TYPE_F32 : GGML_TYPE_F16); + mmp = ggml_vk_get_mul_mat_mat_pipeline(ctx, GGML_TYPE_F16, y_f32_kernel ? GGML_TYPE_F32 : GGML_TYPE_F16, (ggml_prec)dst->op_params[0]); } // Not implemented @@ -3930,6 +4251,167 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx } } +static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * q, const ggml_tensor * k, const ggml_tensor * v, const ggml_tensor * mask, ggml_tensor * dst, bool dryrun = false) { + VK_LOG_DEBUG("ggml_vk_flash_attn((" << q << ", name=" << q->name << ", type=" << q->type << ", ne0=" << q->ne[0] << ", ne1=" << q->ne[1] << ", ne2=" << q->ne[2] << ", ne3=" << q->ne[3] << ", nb0=" << q->nb[0] << ", nb1=" << q->nb[1] << ", nb2=" << q->nb[2] << ", nb3=" << q->nb[3]; + std::cerr << "), (" << k << ", name=" << k->name << ", type=" << k->type << ", ne0=" << k->ne[0] << ", ne1=" << k->ne[1] << ", ne2=" << k->ne[2] << ", ne3=" << k->ne[3] << ", nb0=" << k->nb[0] << ", nb1=" << k->nb[1] << ", nb2=" << k->nb[2] << ", nb3=" << k->nb[3]; + std::cerr << "), (" << v << ", name=" << v->name << ", type=" << v->type << ", ne0=" << v->ne[0] << ", ne1=" << v->ne[1] << ", ne2=" << v->ne[2] << ", ne3=" << v->ne[3] << ", nb0=" << v->nb[0] << ", nb1=" << v->nb[1] << ", nb2=" << v->nb[2] << ", nb3=" << v->nb[3]; + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; + std::cerr << "), " << (dryrun ? 
"dryrun" : "") << ")"); + + GGML_TENSOR_LOCALS(int64_t, neq, q, ne) + GGML_TENSOR_LOCALS(size_t, nbq, q, nb) + GGML_TENSOR_LOCALS(int64_t, nek, k, ne) + GGML_TENSOR_LOCALS(size_t, nbk, k, nb) + GGML_TENSOR_LOCALS(int64_t, nev, v, ne) + GGML_TENSOR_LOCALS(size_t, nbv, v, nb) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + + const uint32_t nem1 = mask ? mask->ne[1] : 0; + const uint32_t nbm1 = mask ? mask->nb[1] : 0; + + const uint32_t D = neq0; + const uint32_t N = neq1; + const uint32_t KV = nek1; + + GGML_ASSERT(ne0 == D); + GGML_ASSERT(ne2 == N); + + // input tensor rows must be contiguous + GGML_ASSERT(nbq0 == ggml_type_size(q->type)); + GGML_ASSERT(nbk0 == ggml_type_size(k->type)); + GGML_ASSERT(nbv0 == ggml_type_size(v->type)); + + GGML_ASSERT(neq0 == D); + GGML_ASSERT(nek0 == D); + GGML_ASSERT(nev0 == D); + + GGML_ASSERT(neq1 == N); + GGML_ASSERT(nev0 == D); + + GGML_ASSERT(nev1 == nek1); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + assert(dst->type == GGML_TYPE_F32); + assert(q->type == GGML_TYPE_F32); + assert(k->type == v->type); + + vk_pipeline *pipelines; + // XXX TODO other backends may be changing accumulator precision to default to f32 soon + bool f32acc = dst->op_params[3] == GGML_PREC_F32; + bool small_rows = N <= flash_attention_num_small_rows; + switch (D) { + case 64: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D64[k->type][f32acc][small_rows][0]; break; + case 80: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D80[k->type][f32acc][small_rows][0]; break; + case 96: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D96[k->type][f32acc][small_rows][0]; break; + case 112: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D112[k->type][f32acc][small_rows][0]; break; + case 128: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D128[k->type][f32acc][small_rows][0]; break; + case 256: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D256[k->type][f32acc][small_rows][0]; break; + default: + assert(!"unsupported D value"); + return; + } + assert(pipelines); + + bool aligned = (KV % pipelines[1]->align) == 0; + vk_pipeline pipeline = pipelines[aligned]; + assert(pipeline); + + if (dryrun) { + // Request descriptor sets + ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); + return; + } + + float scale = 1.0f; + float max_bias = 0.0f; + float logit_softcap = 0.0f; + + memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float)); + memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float)); + memcpy(&logit_softcap, (const float *) dst->op_params + 2, sizeof(float)); + + if (logit_softcap != 0) { + scale /= logit_softcap; + } + + const uint32_t n_head_kv = neq2; + const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + + ggml_vk_sync_buffers(subctx); + + vk_buffer d_Q, d_K, d_V, d_D, d_M; + uint64_t q_buf_offset, k_buf_offset, v_buf_offset, d_buf_offset, m_buf_offset; + + bool Q_uma = false, K_uma = false, V_uma = false, D_uma = false, M_uma = false; + + if (ctx->device->uma) { + ggml_vk_host_get(ctx->device, q->data, d_Q, q_buf_offset); + ggml_vk_host_get(ctx->device, k->data, d_K, q_buf_offset); + ggml_vk_host_get(ctx->device, v->data, d_V, q_buf_offset); + ggml_vk_host_get(ctx->device, dst->data, d_D, 
q_buf_offset); + Q_uma = d_Q != nullptr; + K_uma = d_K != nullptr; + V_uma = d_V != nullptr; + D_uma = d_D != nullptr; + if (mask) { + ggml_vk_host_get(ctx->device, mask->data, d_M, q_buf_offset); + M_uma = d_M != nullptr; + } + } + + + ggml_backend_vk_buffer_context * d_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + ggml_backend_vk_buffer_context * q_buf_ctx = (ggml_backend_vk_buffer_context *)q->buffer->context; + ggml_backend_vk_buffer_context * k_buf_ctx = (ggml_backend_vk_buffer_context *)k->buffer->context; + ggml_backend_vk_buffer_context * v_buf_ctx = (ggml_backend_vk_buffer_context *)v->buffer->context; + + if (!Q_uma) { + d_Q = q_buf_ctx->dev_buffer; + q_buf_offset = vk_tensor_offset(q) + q->view_offs; + } + if (!K_uma) { + d_K = k_buf_ctx->dev_buffer; + k_buf_offset = vk_tensor_offset(k) + k->view_offs; + } + if (!V_uma) { + d_V = v_buf_ctx->dev_buffer; + v_buf_offset = vk_tensor_offset(v) + v->view_offs; + } + if (!D_uma) { + d_D = d_buf_ctx->dev_buffer; + d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; + } + + if (!M_uma) { + d_M = d_Q; + m_buf_offset = q_buf_offset; + if (mask) { + ggml_backend_vk_buffer_context * m_buf_ctx = (ggml_backend_vk_buffer_context*)mask->buffer->context; + d_M = m_buf_ctx->dev_buffer; + m_buf_offset = vk_tensor_offset(mask) + mask->view_offs; + } + } + + const vk_flash_attn_push_constants pc = { N, KV, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, (uint32_t)neq2, (uint32_t)neq3, (uint32_t)nek2, (uint32_t)nek3, (uint32_t)nev2, (uint32_t)nev3, nem1, (uint32_t)nbq2, (uint32_t)nbq3, (uint32_t)nbk2, (uint32_t)nbk3, (uint32_t)nbv2, (uint32_t)nbv3, nbm1, scale, max_bias, logit_softcap, mask != nullptr, n_head_log2, m0, m1 }; + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + { + vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE}, + }, + sizeof(vk_flash_attn_push_constants), &pc, { (uint32_t)neq1, (uint32_t)neq2, (uint32_t)neq3 }); +} + static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) { switch (op) { case GGML_OP_GET_ROWS: @@ -5044,16 +5526,16 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch); vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + ggml_vk_ctx_begin(ctx->device, subctx); for (size_t i = 0; i < num_it; i++) { - ggml_vk_ctx_begin(ctx->device, subctx); ggml_vk_matmul( ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k), m, n, k, k, k, m, k*m, k*n, m*n, split_k, batch, batch, batch, 1, 1 ); - ggml_vk_ctx_end(subctx); } + ggml_vk_ctx_end(subctx); auto begin = std::chrono::high_resolution_clock::now(); ggml_vk_submit(subctx, ctx->fence); @@ -5391,16 +5873,16 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, ggml_vk_buffer_write(y_buf, 0, y, y_sz); vk_context subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); + ggml_vk_ctx_begin(ctx->device, subctx); for (size_t i = 0; i < num_it; i++) { - ggml_vk_ctx_begin(ctx->device, subctx); ggml_vk_matmul( ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), 
ggml_vk_subbuffer(ctx->prealloc_split_k), m, n, k, k, k, m, k*m, k*n, m*n, split_k, batch, batch, batch, 1, 1 ); - ggml_vk_ctx_end(subctx); } + ggml_vk_ctx_end(subctx); auto begin = std::chrono::high_resolution_clock::now(); @@ -5621,7 +6103,8 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { 4096, 512, 11008, 32000, 512, 4096, }; - const size_t num_it = 1; + const size_t num_it = 100; + for (size_t i = 0; i < vals.size(); i += 3) { ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0); ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1); @@ -5676,6 +6159,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod const ggml_tensor * src0 = node->src[0]; const ggml_tensor * src1 = node->src[1]; const ggml_tensor * src2 = node->src[2]; + const ggml_tensor * src3 = node->src[3]; switch (node->op) { // Return on empty ops to avoid generating a compute_ctx and setting exit_tensor @@ -5728,6 +6212,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_POOL_2D: case GGML_OP_LEAKY_RELU: + case GGML_OP_FLASH_ATTN_EXT: break; default: std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl; @@ -5920,6 +6405,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_MUL_MAT_ID: ggml_vk_mul_mat_id(ctx, compute_ctx, src0, src1, src2, node, dryrun); + break; + + case GGML_OP_FLASH_ATTN_EXT: + ggml_vk_flash_attn(ctx, compute_ctx, src0, src1, src2, src3, node, dryrun); + break; default: return false; @@ -6020,6 +6510,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * break; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: + case GGML_OP_FLASH_ATTN_EXT: buf = tensor->buffer; break; @@ -6751,6 +7242,57 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm return true; } break; + case GGML_OP_FLASH_ATTN_EXT: + { + ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; + if (!ggml_vk_get_device(ctx->device)->coopmat2) { + return false; + } + switch (op->src[0]->ne[0]) { + case 64: + case 80: + case 96: + case 112: + case 128: + case 256: + break; + default: + return false; + } + if (op->src[0]->type != GGML_TYPE_F32) { + return false; + } + if (op->type != GGML_TYPE_F32) { + return false; + } + if (op->src[3] && op->src[3]->type != GGML_TYPE_F16) { + return false; + } + // It's straightforward to support different K/V dequant, but would + // significantly increase the number of pipelines + if (op->src[1]->type != op->src[2]->type) { + return false; + } + switch (op->src[1]->type) { + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + // K dequants currently disabled because D dimension is rounded up to 256 and runs inefficiently + //case GGML_TYPE_Q2_K: + //case GGML_TYPE_Q3_K: + //case GGML_TYPE_Q4_K: + //case GGML_TYPE_Q5_K: + //case GGML_TYPE_Q6_K: + case GGML_TYPE_IQ4_NL: + break; + default: + return false; + } + return true; + } case GGML_OP_GET_ROWS: { switch (op->src[0]->type) { @@ -7065,6 +7607,7 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { ggml_tensor * src0 = tensor->src[0]; ggml_tensor * src1 = tensor->src[1]; ggml_tensor * src2 = tensor->src[2]; + ggml_tensor * src3 = tensor->src[3]; struct ggml_init_params iparams = { /*.mem_size =*/ 
2ul*1024ul*1024ul*1024ul, @@ -7077,15 +7620,18 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { struct ggml_tensor * src0_clone = nullptr; struct ggml_tensor * src1_clone = nullptr; struct ggml_tensor * src2_clone = nullptr; + struct ggml_tensor * src3_clone = nullptr; struct ggml_tensor * tensor_clone = nullptr; size_t src0_size; size_t src1_size; size_t src2_size; + size_t src3_size; void * src0_buffer = nullptr; void * src1_buffer = nullptr; void * src2_buffer = nullptr; + void * src3_buffer = nullptr; if (src0 != nullptr) { src0_clone = ggml_dup_tensor(ggml_ctx, src0); @@ -7213,8 +7759,53 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { ggml_vk_print_tensor(src2, "src2"); } } + if (src3 != nullptr) { + src3_clone = ggml_dup_tensor(ggml_ctx, src3); - if (tensor->op == GGML_OP_MUL_MAT) { + src3_size = ggml_nbytes(src3); + + src3_buffer = malloc(src3_size); + src3_clone->data = src3_buffer; + if (ggml_backend_buffer_is_host(src3->buffer)) { + memcpy(src3_clone->data, src3->data, src3_size); + memcpy(src3_clone->nb, src3->nb, sizeof(size_t) * GGML_MAX_DIMS); + } else if (ggml_backend_buffer_is_vk(src3->buffer)) { + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)src3->buffer->context; + vk_buffer& buffer_gpu = buf_ctx->dev_buffer; + uint64_t offset = vk_tensor_offset(src3) + src3->view_offs; + if (!ggml_is_contiguous(src3) && ggml_vk_dim01_contiguous(src3)) { + for (int i3 = 0; i3 < src3->ne[3]; i3++) { + for (int i2 = 0; i2 < src3->ne[2]; i2++) { + const int idx = i3*src3->ne[2] + i2; + ggml_vk_buffer_read(buffer_gpu, offset + idx * src3->nb[2], ((char *)src3_clone->data + idx * src3_clone->nb[2]), src3->ne[1] * src3->nb[1]); + } + } + + src3_clone->nb[0] = src3->nb[0]; + src3_clone->nb[1] = src3->nb[1]; + for (int i = 2; i < GGML_MAX_DIMS; i++) { + src3_clone->nb[i] = src3_clone->nb[i - 1]*src3_clone->ne[i - 1]; + } + } else { + if (offset + src3_size >= buffer_gpu->size) { + src3_size = buffer_gpu->size - offset; + } + ggml_vk_buffer_read(buffer_gpu, offset, src3_clone->data, src3_size); + memcpy(src3_clone->nb, src3->nb, sizeof(size_t) * GGML_MAX_DIMS); + } + } else { + GGML_ABORT("fatal error"); + } + + if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { + ggml_vk_print_tensor(src3, "src3"); + } + } + + if (tensor->op == GGML_OP_FLASH_ATTN_EXT) { + const float *params = (const float *)tensor->op_params; + tensor_clone = ggml_flash_attn_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, src3_clone, params[0], params[1], params[2]); + } else if (tensor->op == GGML_OP_MUL_MAT) { tensor_clone = ggml_mul_mat(ggml_ctx, src0_clone, src1_clone); } else if (tensor->op == GGML_OP_MUL_MAT_ID) { tensor_clone = ggml_mul_mat_id(ggml_ctx, src0_clone, src1_clone, src2_clone); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt index 51c78b7d2..bd0c74cb1 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt @@ -1,7 +1,9 @@ find_package (Threads REQUIRED) +find_package(Vulkan COMPONENTS glslc REQUIRED) set(TARGET vulkan-shaders-gen) add_executable(${TARGET} vulkan-shaders-gen.cpp) install(TARGETS ${TARGET} RUNTIME) target_compile_features(${TARGET} PRIVATE cxx_std_17) target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads) +target_link_libraries(vulkan-shaders-gen PRIVATE Vulkan::Vulkan) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp 
b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp new file mode 100644 index 000000000..a8707b621 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp @@ -0,0 +1,305 @@ + +#include "types.comp" + +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ4_0 { + block_q4_0_packed16 block; +}; + +float16_t dequantFuncQ4_0(const in decodeBufQ4_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float16_t d = bl.block.d; + const uint idx = coordInBlock[1]; + const uint shift = (idx & 0x10) >> 2; + uint32_t qs = unpack8(uint32_t(bl.block.qs[(idx & 0xE) >> 1]))[idx & 1]; + qs >>= shift; + qs &= 0xF; + float16_t ret = (float16_t(qs) - float16_t(8)) * d; + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ4_1 { + block_q4_1 block; +}; + +float16_t dequantFuncQ4_1(const in decodeBufQ4_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float16_t d = bl.block.d; + const float16_t m = bl.block.m; + const uint idx = coordInBlock[1]; + const uint iqs = idx & 0xF; + const uint shift = (idx & 0x10) >> 2; + uint32_t qs = bl.block.qs[iqs]; + qs >>= shift; + qs &= 0xF; + float16_t ret = float16_t(qs) * d + m; + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ5_0 { + block_q5_0 block; +}; + +float16_t dequantFuncQ5_0(const in decodeBufQ5_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float16_t d = bl.block.d; + const uint idx = coordInBlock[1]; + const uint iqs = idx & 0xF; + + const uint uint_qh = uint(bl.block.qh[1]) << 16 | bl.block.qh[0]; + const uint qh = ((uint_qh >> idx) << 4) & 0x10; + + const uint shift = (idx & 0x10) >> 2; + uint32_t qs = bl.block.qs[iqs]; + qs >>= shift; + qs &= 0xF; + + float16_t ret = (float16_t(qs | qh) - float16_t(16)) * d; + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 8) buffer decodeBufQ5_1 { + block_q5_1 block; +}; + +float16_t dequantFuncQ5_1(const in decodeBufQ5_1 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float16_t d = bl.block.d; + const float16_t m = bl.block.m; + const uint idx = coordInBlock[1]; + const uint iqs = idx & 0xF; + + const uint uint_qh = bl.block.qh; + const uint qh = ((uint_qh >> idx) << 4) & 0x10; + + const uint shift = (idx & 0x10) >> 2; + uint32_t qs = bl.block.qs[iqs]; + qs >>= shift; + qs &= 0xF; + + float16_t ret = float16_t(qs | qh) * d + m; + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ8_0 { + block_q8_0_packed16 block; +}; + +float16_t dequantFuncQ8_0(const in decodeBufQ8_0 bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float16_t d = bl.block.d; + const uint idx = coordInBlock[1]; + const uint iqs = idx; + + // Load 16b and select the byte for this element + int32_t qs = unpack8(int32_t(bl.block.qs[(iqs & 0x1E) >> 1]))[iqs & 1]; + float16_t ret = float16_t(qs) * d; + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ2_K { + block_q2_K block; +}; + +float16_t dequantFuncQ2_K(const in decodeBufQ2_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const f16vec2 d = bl.block.d; + const uint idx = coordInBlock[1]; + const uint iqs = idx; + + const uint qsi = (iqs / 128) * 32 + (iqs % 32); // 0..31 + const uint scalesi = iqs / 16; // 0..15 + const uint qsshift = ((iqs % 128) / 32) * 2; // 0,2,4,6 + + uint32_t qs 
= bl.block.qs[qsi]; + const uint scales = bl.block.scales[scalesi]; + float16_t ret = d.x * float16_t(scales & 0xF) * float16_t((qs >> qsshift) & 3) - d.y * float16_t(scales >> 4); + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ3_K { + block_q3_K block; +}; + +float16_t dequantFuncQ3_K(const in decodeBufQ3_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const uint idx = coordInBlock[1]; + const uint iqs = idx; + + const uint n = iqs / 128; // 0,1 + const uint qsi = n * 32 + (iqs % 32); // 0..63 + const uint hmi = (iqs % 32); // 0..31 + const uint j = (iqs % 128) / 8; // 0..15 + const uint is = iqs / 16; // 0..15 + const uint halfsplit = ((iqs % 128) / 32); // 0,1,2,3 + const uint qsshift = halfsplit * 2; // 0,2,4,6 + const uint m = 1 << (4 * n + halfsplit); // 1,2,4,8,16,32,64,128 + + uint32_t scaleidx0 = (is < 8) ? is : (is-8); + uint32_t scaleidx0shift = (is < 8) ? 0 : 4; + uint32_t scaleidx1 = is + 8 - (is/4)*4; + uint32_t scaleidx1shift = (is/4)*2; + + const int8_t us = int8_t(((bl.block.scales[scaleidx0] >> scaleidx0shift) & 0xF) | (((bl.block.scales[scaleidx1] >> scaleidx1shift) & 3) << 4)); + + const float16_t dl = bl.block.d * float16_t(us - 32); + + float16_t ret = dl * float16_t(int8_t((bl.block.qs[qsi ] >> qsshift) & 3) - (((bl.block.hmask[hmi ] & m) != 0) ? 0 : 4)); + + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K { + block_q4_K block; +}; + +float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const uint idx = coordInBlock[1]; + const uint iqs = idx; + + const uint n = iqs / 64; // 0,1,2,3 + const uint b = (iqs % 64) / 32; // 0,1 + const uint is = (idx & 0xE0) >> 5; // 0..7 + const uint qsi = n * 32 + (iqs % 32); // 0..127 + + const f16vec2 loadd = bl.block.d; + + uint32_t sc; + uint32_t mbyte; + + uint32_t scidx0 = (is < 4) ? is : (is + 4); + uint32_t scidx1 = (is < 4) ? is : (is - 4); + uint32_t scidxmask1 = (is < 4) ? 0x30 : 0xC0; + uint32_t scidxshift1 = (is < 4) ? 0 : 2; + uint32_t mbidx0 = is + 4; + uint32_t mbidx1 = (is < 4) ? is + 4 : is; + uint32_t mbidxmask0 = (is < 4) ? 0xF : 0xF0; + uint32_t mbidxshift0 = (is < 4) ? 0 : 4; + uint32_t mbidxmask1 = (is < 4) ? 0x30 : 0xC0; + uint32_t mbidxshift1 = (is < 4) ? 0 : 2; + + sc = uint8_t((bl.block.scales[scidx0] & 0xF) | ((bl.block.scales[scidx1] & scidxmask1) >> scidxshift1)); + mbyte = uint8_t(((bl.block.scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((bl.block.scales[mbidx1] & mbidxmask1) >> mbidxshift1)); + + const float16_t d = loadd.x * float16_t(sc); + const float16_t m = loadd.y * float16_t(mbyte); + + uint32_t dmask = 0xF << (b * 4); + + float16_t ret = d * float16_t((bl.block.qs[qsi ] & dmask) >> (b * 4)) - m; + + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K { + block_q5_K block; +}; + +float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const uint idx = coordInBlock[1]; + const uint iqs = idx; + + const uint n = iqs / 64; // 0,1,2,3 + const uint b = (iqs % 64) / 32; // 0,1 + const uint is = (idx & 0xE0) >> 5; // 0..7 + const uint qsi = n * 32 + (iqs % 32); // 0..127 + const uint qhi = (iqs % 32); // 0..31 + + const uint8_t hm = uint8_t(1 << (iqs / 32)); + + const f16vec2 loadd = bl.block.d; + + uint32_t sc; + uint32_t mbyte; + + uint32_t scidx0 = (is < 4) ? 
is : (is + 4); + uint32_t scidx1 = (is < 4) ? is : (is - 4); + uint32_t scidxmask1 = (is < 4) ? 0x30 : 0xC0; + uint32_t scidxshift1 = (is < 4) ? 0 : 2; + uint32_t mbidx0 = is + 4; + uint32_t mbidx1 = (is < 4) ? is + 4 : is; + uint32_t mbidxmask0 = (is < 4) ? 0xF : 0xF0; + uint32_t mbidxshift0 = (is < 4) ? 0 : 4; + uint32_t mbidxmask1 = (is < 4) ? 0x30 : 0xC0; + uint32_t mbidxshift1 = (is < 4) ? 0 : 2; + + sc = uint8_t((bl.block.scales[scidx0] & 0xF) | ((bl.block.scales[scidx1] & scidxmask1) >> scidxshift1)); + mbyte = uint8_t(((bl.block.scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((bl.block.scales[mbidx1] & mbidxmask1) >> mbidxshift1)); + + const float16_t d = loadd.x * float16_t(sc); + const float16_t m = loadd.y * float16_t(mbyte); + + uint32_t dmask = 0xF << (b * 4); + + float16_t ret = d * (float16_t((bl.block.qs[qsi ] & dmask) >> (b * 4)) + float16_t((bl.block.qh[qhi ] & hm) != 0 ? 16 : 0)) - m; + + return ret; +} + +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ6_K { + block_q6_K block; +}; + +float16_t dequantFuncQ6_K(const in decodeBufQ6_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const uint idx = coordInBlock[1]; + const uint iqs = idx; + + const uint n = iqs / 128; // 0,1 + const uint b = (iqs % 128) / 64; // 0,1 + const uint is_b = (iqs % 32) / 16; // 0,1 + const uint qhshift = ((iqs % 128) / 32) * 2;// 0,2,4,6 + const uint is = 8 * n + qhshift + is_b; // 0..15 + const uint qsi = n * 64 + (iqs % 64); // 0..127 + const uint qhi = n * 32 + (iqs % 32); // 0..63 + + const float16_t dscale = bl.block.d * float16_t(bl.block.scales[is]); + + float16_t ret = dscale * float16_t(int8_t(((bl.block.ql[qsi ] >> (b * 4)) & 0xF) | (((bl.block.qh[qhi ] >> qhshift) & 3) << 4)) - 32); + + return ret; +} + +#if defined(DATA_A_IQ4_NL) +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufIQ4_NL { + block_iq4_nl block; +}; + +float16_t dequantFuncIQ4_NL(const in decodeBufIQ4_NL bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const float16_t d = bl.block.d; + const uint idx = coordInBlock[1]; + const uint iqs = idx & 0xF; + const uint shift = (idx & 0x10) >> 2; + uint32_t qs = bl.block.qs[iqs]; + qs >>= shift; + qs &= 0xF; + float16_t ret = float16_t(kvalues_iq4nl[qs]) * d; + return ret; +} +#endif + +#if defined(DATA_A_Q4_0) +#define dequantFuncA dequantFuncQ4_0 +#elif defined(DATA_A_Q4_1) +#define dequantFuncA dequantFuncQ4_1 +#elif defined(DATA_A_Q5_0) +#define dequantFuncA dequantFuncQ5_0 +#elif defined(DATA_A_Q5_1) +#define dequantFuncA dequantFuncQ5_1 +#elif defined(DATA_A_Q8_0) +#define dequantFuncA dequantFuncQ8_0 +#elif defined(DATA_A_Q2_K) +#define dequantFuncA dequantFuncQ2_K +#elif defined(DATA_A_Q3_K) +#define dequantFuncA dequantFuncQ3_K +#elif defined(DATA_A_Q4_K) +#define dequantFuncA dequantFuncQ4_K +#elif defined(DATA_A_Q5_K) +#define dequantFuncA dequantFuncQ5_K +#elif defined(DATA_A_Q6_K) +#define dequantFuncA dequantFuncQ6_K +#elif defined(DATA_A_IQ4_NL) +#define dequantFuncA dequantFuncIQ4_NL +#endif diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp new file mode 100644 index 000000000..c5be8131b --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp @@ -0,0 +1,289 @@ +#version 450 + +#extension GL_EXT_control_flow_attributes : enable +#extension GL_EXT_shader_16bit_storage : require + +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension 
GL_EXT_shader_explicit_arithmetic_types_int8 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require + +#extension GL_KHR_memory_scope_semantics : enable +#extension GL_KHR_cooperative_matrix : enable +#extension GL_NV_cooperative_matrix2 : enable +#extension GL_EXT_buffer_reference : enable +#extension GL_KHR_shader_subgroup_ballot : enable +#extension GL_KHR_shader_subgroup_vote : enable +#extension GL_EXT_null_initializer : enable + +#include "types.comp" +#include "dequant_funcs_cm2.comp" + +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; + +layout (constant_id = 1) const uint32_t Br = 32; +layout (constant_id = 2) const uint32_t Bc = 32; +layout (constant_id = 3) const uint32_t D = 32; +layout (constant_id = 4) const uint32_t Clamp = gl_CooperativeMatrixClampModeConstantNV; + +layout (push_constant) uniform parameter { + uint32_t N; + uint32_t KV; + + uint32_t ne1; + uint32_t ne2; + uint32_t ne3; + + uint32_t neq2; + uint32_t neq3; + uint32_t nek2; + uint32_t nek3; + uint32_t nev2; + uint32_t nev3; + uint32_t nem1; + + uint32_t nb02; + uint32_t nb03; + uint32_t nb12; + uint32_t nb13; + uint32_t nb22; + uint32_t nb23; + uint32_t nb31; + + float scale; + float max_bias; + float logit_softcap; + + uint32_t mask; + uint32_t n_head_log2; + float m0; + float m1; +} p; + +layout (binding = 0) readonly buffer Q {uint8_t data_q[];}; +layout (binding = 1) readonly buffer K {uint8_t data_k[];}; +layout (binding = 2) readonly buffer V {uint8_t data_v[];}; +layout (binding = 3) readonly buffer M {uint8_t data_m[];}; +layout (binding = 4) writeonly buffer O {D_TYPE data_o[];}; + +#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) + +ACC_TYPE maxReduce(const in ACC_TYPE x, const in ACC_TYPE y) { + return max(x, y); +} + +ACC_TYPE smearReduce(const in ACC_TYPE x, const in ACC_TYPE y) { + return x; +} + +// Replace matrix elements >= numRows or numCols with 'replace' +ACC_TYPE replacePadding(const in uint32_t row, const in uint32_t col, const in ACC_TYPE elem, const in ACC_TYPE replace, const in uint32_t numRows, const in uint32_t numCols) { + if (row >= numRows || col >= numCols) { + return replace; + } + return elem; +} + +ACC_TYPE Exp(const in uint32_t row, const in uint32_t col, const in ACC_TYPE elem) +{ + return exp(elem); +} + +ACC_TYPE Max(const in uint32_t row, const in uint32_t col, const in ACC_TYPE elem0, const in ACC_TYPE elem1) +{ + return max(elem0, elem1); +} + +#if defined(BLOCK_SIZE) +#define DECODEFUNC , DEQUANTFUNC +#else +#define DECODEFUNC +#endif + +void main() { +#if defined(DATA_A_IQ4_NL) + init_iq4nl_shmem(); +#endif + + const uint32_t N = p.N; + const uint32_t KV = p.KV; + + const uint32_t Tr = CEIL_DIV(N, Br); + const uint32_t Tc = CEIL_DIV(KV, Bc); + + const uint32_t i = gl_WorkGroupID.x; + + const uint32_t iq2 = gl_WorkGroupID.y; + const uint32_t iq3 = gl_WorkGroupID.z; + + // broadcast factors + const uint32_t rk2 = p.neq2/p.nek2; + const uint32_t rk3 = p.neq3/p.nek3; + + const uint32_t rv2 = p.neq2/p.nev2; + const uint32_t rv3 = p.neq3/p.nev3; + + // k indices + const uint32_t ik3 = iq3 / rk3; + const uint32_t ik2 = iq2 / rk2; + + // v indices + const uint32_t iv3 = iq3 / rv3; + const uint32_t iv2 = iq2 / rv2; + + tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutQ = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV); + tensorLayoutNV<2, Clamp> tensorLayoutK = createTensorLayoutNV(2, Clamp); + tensorLayoutNV<2, Clamp> tensorLayoutV 
= createTensorLayoutNV(2, Clamp); + + tensorViewNV<2, false, 1, 0> tensorViewTranspose = createTensorViewNV(2, false, 1, 0); + +#if defined(BLOCK_SIZE) + tensorLayoutK = setTensorLayoutBlockSizeNV(tensorLayoutK, 1, BLOCK_SIZE); + tensorLayoutV = setTensorLayoutBlockSizeNV(tensorLayoutV, 1, BLOCK_SIZE); +#endif + + tensorLayoutQ = setTensorLayoutDimensionNV(tensorLayoutQ, N, D); + tensorLayoutK = setTensorLayoutDimensionNV(tensorLayoutK, KV, D); + tensorLayoutV = setTensorLayoutDimensionNV(tensorLayoutV, KV, D); + + coopmat Q; + coopmat Qf16; + + uint32_t q_offset = iq2*p.nb02+iq3*p.nb03; + coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0, D)); + + Qf16 = coopmat(Q); + Qf16 *= float16_t(p.scale); + + coopmat O = coopmat(0); + + coopmat L, M; + + L = coopmat(0); + M = coopmat(-1.0/0.0); + + ACC_TYPE slope = ACC_TYPE(1.0); + + // ALiBi + if (p.max_bias > 0.0f) { + const uint32_t h = iq2; + + const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1); + const int exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1); + + slope = pow(base, ACC_TYPE(exph)); + } + + [[dont_unroll]] + for (uint32_t j = 0; j < Tc; ++j) { + + coopmat S = coopmat(0); + + coopmat K_T; + + uint32_t k_offset = ik2*p.nb12 + ik3*p.nb13; + coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, D), tensorViewTranspose DECODEFUNC); + S = coopMatMulAdd(Qf16, K_T, S); + + if (p.logit_softcap != 0.0f) { + [[unroll]] + for (int k = 0; k < S.length(); ++k) { + S[k] = ACC_TYPE(p.logit_softcap)*tanh(S[k]); + } + } + + if (p.mask != 0) { + tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutM = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV); + tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV); + + coopmat mv; + + coopMatLoadTensorNV(mv, data_m, 0, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc)); + + S += slope*coopmat(mv); + } + + // Clear padding elements to -inf, so they don't contribute to rowmax + if (Clamp != 0 && + ((j + 1) * Bc > KV || + (i + 1) * Br > N)) { + + uint R = ((i + 1) * Br > N) ? (N % Br) : Br; + uint C = ((j + 1) * Bc > KV) ? (KV % Bc) : Bc; + + coopMatPerElementNV(S, S, replacePadding, ACC_TYPE(-1.0/0.0), R, C); + } + + coopmat rowmax, P, rowsum, eM; + + coopMatReduceNV(rowmax, S, gl_CooperativeMatrixReduceRowNV, maxReduce); + + coopmat Mold = M; + + // M = max(rowmax, Mold) + // P = e^(S - M) + // eM = e^(Mold - M) + coopMatPerElementNV(M, rowmax, Max, Mold); + coopMatPerElementNV(P, S - M, Exp); + coopMatPerElementNV(eM, Mold - M, Exp); + + // Clear padding elements to 0, so they don't contribute to rowsum + if (Clamp != 0 && + ((j + 1) * Bc > KV || + (i + 1) * Br > N)) { + + uint R = ((i + 1) * Br > N) ? (N % Br) : Br; + uint C = ((j + 1) * Bc > KV) ? (KV % Bc) : Bc; + + coopMatPerElementNV(P, P, replacePadding, ACC_TYPE(0.0), R, C); + } + + coopmat P_A = coopmat(P); + + // compute rowsum by multiplying by matrix of all ones. 
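// In scalar terms, the streaming-softmax state kept per row of Q is
// (M = running max, L = running sum of exponentials, O = running output):
//
//   M_new = max(M_old, rowmax(S))
//   P     = exp(S - M_new)
//   L_new = exp(M_old - M_new) * L_old + rowsum(P)
//   O_new = exp(M_old - M_new) * O_old + P * V
//
// and the final result is O / L. Multiplying P by a matrix of all ones is
// just a way to compute rowsum(P) on the matrix-multiply unit.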
+ coopmat One = coopmat(1.0); + + rowsum = coopmat(0.0); + rowsum = coopMatMulAdd(P_A, One, rowsum); + + coopmat V; + uint32_t v_offset = iv2*p.nb22 + iv3*p.nb23; + coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, D) DECODEFUNC); + + L = eM*L + rowsum; + + // This is the "diagonal" matrix in the paper, but since we do componentwise + // multiply rather than matrix multiply it has the diagonal element smeared + // across the row + coopmat eMdiag; + + // resize eM by using smear/reduce + coopMatReduceNV(eMdiag, eM, gl_CooperativeMatrixReduceRowNV, smearReduce); + + O = eMdiag * O; + + O = coopMatMulAdd(P_A, V, O); + } + + coopmat Ldiag; + + // resize L by using smear/reduce + coopMatReduceNV(Ldiag, L, gl_CooperativeMatrixReduceRowNV, smearReduce); + + [[unroll]] + for (int k = 0; k < Ldiag.length(); ++k) { + Ldiag[k] = ACC_TYPE(1.0) / Ldiag[k]; + } + + O = Ldiag*O; + + tensorLayoutNV<3, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(3, gl_CooperativeMatrixClampModeConstantNV); + tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.ne2, p.ne1, D); + + // permute dimensions + tensorViewNV<3, false, 1, 0, 2> tensorViewPermute = createTensorViewNV(3, false, 1, 0, 2); + uint32_t o_offset = iq3*p.ne2*p.ne1; + + coopmat O_D = coopmat(O); + coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, 1, 0, D), tensorViewPermute); +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp new file mode 100644 index 000000000..cbfa5dce1 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp @@ -0,0 +1,328 @@ +#version 450 + +#extension GL_EXT_control_flow_attributes : enable +#extension GL_EXT_shader_16bit_storage : require + +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require +#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require + +#extension GL_KHR_memory_scope_semantics : enable +#extension GL_KHR_cooperative_matrix : enable +#extension GL_NV_cooperative_matrix2 : enable +#extension GL_EXT_buffer_reference : enable +#extension GL_KHR_shader_subgroup_ballot : enable +#extension GL_KHR_shader_subgroup_vote : enable + +#include "types.comp" + +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; + +layout (constant_id = 1) const uint BM = 64; +layout (constant_id = 2) const uint BN = 64; +layout (constant_id = 3) const uint BK = 16; // Assumed to be 32 if working with a quant + +layout (push_constant) uniform parameter +{ + uint M; + uint N; + uint K; + uint stride_a; + uint stride_b; + uint stride_d; + + uint batch_stride_a; + uint batch_stride_b; + uint batch_stride_d; + +#ifdef MUL_MAT_ID + uint nei0; + uint nei1; + uint nbi1; + uint ne11; +#else + uint k_split; + uint ne02; + uint ne12; + uint broadcast2; + uint broadcast3; +#endif +} p; + + +layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; +layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; +layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; + +#if QUANT_K > 1 +#define DECODEFUNCA , dequantFuncA +#define MAT_A_TYPE float16_t + +#include "dequant_funcs_cm2.comp" + +#else +#define DECODEFUNCA +#define MAT_A_TYPE A_TYPE +#endif + +#define MAT_B_TYPE B_TYPE + +#ifdef MUL_MAT_ID +layout (binding = 3) readonly buffer IDS {int data_ids[];}; + +shared 
u16vec4 row_ids[3072]; + +layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufB { + B_TYPE b[]; +}; + +uint _ne1; +shared uint _ne1_sh; + +B_TYPE decodeFuncB(const in decodeBufB bl, const in uint blockCoords[2], const in uint coordInBlock[2]) +{ + const uint row_i = blockCoords[0]; + + if (row_i >= _ne1) { + return B_TYPE(0.0); + } + + const u16vec4 row_idx = row_ids[row_i]; + B_TYPE ret = data_b[row_idx.y * p.batch_stride_b + row_idx.x * p.stride_b + blockCoords[1]]; + + return ret; +} + +D_TYPE perElemOpD(const in uint32_t r, const in uint32_t c, const in D_TYPE elem, const in uint32_t ir, const in uint32_t ic) +{ + uint dr = ir * BM + r; + uint dc = ic * BN + c; + + if (dr < p.M && dc < _ne1) { + uint row_i = dc; + const u16vec4 row_idx = row_ids[row_i]; + data_d[row_idx.y * p.batch_stride_d + row_idx.z * p.stride_d + dr] = elem; + } + return elem; +} + +#endif + +void main() { +#if defined(DATA_A_IQ4_NL) + init_iq4nl_shmem(); +#endif + +#ifdef MUL_MAT_ID + const uint expert_idx = gl_GlobalInvocationID.z; +#else + const uint batch_idx = gl_GlobalInvocationID.z; + + const uint i13 = batch_idx / p.ne12; + const uint i12 = batch_idx % p.ne12; + + const uint i03 = i13 / p.broadcast3; + const uint i02 = i12 / p.broadcast2; + + const uint batch_idx_a = i03 * p.ne02 + i02; +#endif + + const uint blocks_m = (p.M + BM - 1) / BM; + const uint ir = gl_WorkGroupID.x % blocks_m; + const uint ik = gl_WorkGroupID.x / blocks_m; + const uint ic = gl_WorkGroupID.y; + +#ifdef MUL_MAT_ID + // Spread the search across all elements in the first subgroup + if (gl_SubgroupID == 0) { + _ne1 = 0; + uint num_elements = p.nei1 * p.nei0; + + for (uint i = gl_SubgroupInvocationID; subgroupAny(i < num_elements); i += gl_SubgroupSize) { + bool in_range = i < num_elements; + uint ii0 = i % p.nei0; + uint ii1 = i / p.nei0; + uint id = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0; + uvec4 ballot = subgroupBallot(in_range && id == expert_idx); + uint idx = subgroupBallotExclusiveBitCount(ballot); + if (in_range && id == expert_idx) { + row_ids[_ne1 + idx] = u16vec4(ii0 % p.ne11, ii1, ii0, 0); + } + _ne1 += subgroupBallotBitCount(ballot); + } + _ne1_sh = _ne1; + } + + barrier(); + + _ne1 = _ne1_sh; + + // Workgroup has no work + if (ic * BN >= _ne1) return; +#endif + +#ifdef MUL_MAT_ID + uint start_k = 0; + const uint end_k = p.K; +#else + uint start_k = ik * p.k_split; + const uint end_k = min(p.K, (ik + 1) * p.k_split); +#endif + + coopmat sum; + sum = coopmat(0.0); + +#ifdef MUL_MAT_ID + uint pos_a = (expert_idx * p.batch_stride_a) / QUANT_K; + uint pos_b = 0; +#else + uint pos_a = (batch_idx_a * p.batch_stride_a) / QUANT_K; + uint pos_b = batch_idx * p.batch_stride_b; +#endif + + uint stride_a = p.stride_a / QUANT_K; + uint stride_b = p.stride_b; + + // Hint to the compiler that values are aligned (want 16B alignment). + // Quants are always block-aligned, no alignment needed. 
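// Note on the "&= ~7" trick below: it does not change a stride that is
// already a multiple of 8 (which the ALIGNED variant is presumably only
// dispatched with); it only makes the low bits provably zero, so 8 elements
// of a 2- or 4-byte type (at least 16 bytes) of alignment becomes visible to
// the compiler.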
+#if ALIGNED +#if QUANT_K == 1 + stride_a &= ~7; +#endif + stride_b &= ~7; +#endif + + // Create layouts for both clamped and unclamped accesses + tensorLayoutNV<2> tensorLayoutA = createTensorLayoutNV(2); + tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutAClamp = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV); + tensorLayoutNV<2> tensorLayoutB = createTensorLayoutNV(2); + tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutBClamp = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV); + tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV); + +#if QUANT_K > 1 + tensorLayoutA = setTensorLayoutBlockSizeNV(tensorLayoutA, 1, QUANT_K); + tensorLayoutAClamp = setTensorLayoutBlockSizeNV(tensorLayoutAClamp, 1, QUANT_K); +#endif + + // Use end_k rather than p.K as the dimension because that's what + // we need to bound check against when using split_k + tensorLayoutA = setTensorLayoutDimensionNV(tensorLayoutA, p.M, end_k); + tensorLayoutB = setTensorLayoutDimensionNV(tensorLayoutB, p.N, end_k); + tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.N, p.M); + tensorLayoutAClamp = setTensorLayoutDimensionNV(tensorLayoutAClamp, p.M, end_k); + tensorLayoutBClamp = setTensorLayoutDimensionNV(tensorLayoutBClamp, p.N, end_k); + + tensorViewNV<2, false, 1, 0> tensorViewTranspose = createTensorViewNV(2, false, 1, 0); + +#if !defined(MUL_MAT_ID) + // Detect a fast path where all loads are entirely in bounds and no clamping is required + if ((ir + 1) * BM <= p.M && (ic + 1) * BN <= p.N && (start_k % BK) == 0 && (end_k % BK) == 0 && +#if QUANT_K == 1 + (stride_a % 8) == 0 && +#endif + (stride_b % 8) == 0 && (start_k % 8) == 0) { + // Hint to the compiler that values are aligned (want 16B alignment) + start_k &= ~7; + stride_b &= ~7; +#if QUANT_K == 1 + stride_a &= ~7; +#endif + + tensorLayoutA = setTensorLayoutStrideNV(tensorLayoutA, stride_a, 1); + tensorLayoutB = setTensorLayoutStrideNV(tensorLayoutB, stride_b, 1); + + uint k_iters = (end_k - start_k + BK - 1) / BK; + + for (uint block_k = start_k, i = 0; i < k_iters; block_k += BK, ++i) { + + coopmat mat_a; + coopmat mat_b; + + coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA); + coopmat mat_a_ft = coopmat(mat_a); + + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose); + coopmat mat_b_ft = coopmat(mat_b); + + sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum); + } + } else +#endif // !defined(MUL_MAT_ID) + { + tensorLayoutA = setTensorLayoutStrideNV(tensorLayoutA, stride_a, 1); + + tensorLayoutAClamp = setTensorLayoutStrideNV(tensorLayoutAClamp, stride_a, 1); + + tensorLayoutB = setTensorLayoutStrideNV(tensorLayoutB, stride_b, 1); + + tensorLayoutBClamp = setTensorLayoutStrideNV(tensorLayoutBClamp, stride_b, 1); + + [[dont_unroll]] + for (uint block_k = start_k; block_k < end_k; block_k += BK) { + + coopmat mat_a; + coopmat mat_b; + coopmat mat_a_ft; + coopmat mat_b_ft; + + // Clamping is expensive, so detect different code paths for each combination + // of A and B needing clamping. 
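// Summary of the dispatch below: a tile load is "unclamped" when the whole
// BM x BK (or BN x BK) slice is in bounds and 8-element aligned, so the
// cheaper non-clamping tensor layout can be used. Under MUL_MAT_ID, B is
// always treated as unclamped because decodeFuncB bounds-checks the row
// index itself and returns 0 for out-of-range rows. The four branches cover
// each combination of A and B needing the clamped layout.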
+ bool unclampedA = (ir + 1) * BM <= p.M && block_k + BK <= end_k && (block_k % 8) == 0; +#ifdef MUL_MAT_ID + bool unclampedB = true; +#else + bool unclampedB = (ic + 1) * BN <= p.N && block_k + BK <= end_k && (block_k % 8) == 0; +#endif + if (unclampedA && unclampedB) { + coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, (block_k & ~7), BK) DECODEFUNCA); +#ifdef MUL_MAT_ID + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB); +#else + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, (block_k & ~7), BK), tensorViewTranspose); +#endif + mat_a_ft = coopmat(mat_a); + mat_b_ft = coopmat(mat_b); + sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum); + } else if (unclampedA && !unclampedB) { + coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, (block_k & ~7), BK) DECODEFUNCA); + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose); + + mat_a_ft = coopmat(mat_a); + mat_b_ft = coopmat(mat_b); + sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum); + } else if (!unclampedA && unclampedB) { + coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA); +#ifdef MUL_MAT_ID + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB); +#else + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, (block_k & ~7), BK), tensorViewTranspose); +#endif + mat_a_ft = coopmat(mat_a); + mat_b_ft = coopmat(mat_b); + sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum); + } else if (!unclampedA && !unclampedB) { + coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA); + coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose); + + mat_a_ft = coopmat(mat_a); + mat_b_ft = coopmat(mat_b); + sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum); + } + } + } + + // Convert from ACC_TYPE to D_TYPE + coopmat mat_d; + mat_d = coopmat(sum); + +#ifdef MUL_MAT_ID + // Call callback to store each element, remapping row through shared memory + coopMatPerElementNV(mat_d, mat_d, perElemOpD, ir, ic); +#else + tensorLayoutD = setTensorLayoutStrideNV(tensorLayoutD, p.stride_d, 1); + + uint pos_d = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z; + coopMatStoreTensorNV(mat_d, data_d, pos_d, sliceTensorLayoutNV(tensorLayoutD, ic * BN, BN, ir * BM, BM), tensorViewTranspose); +#endif +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 5c317b68b..4716e2c80 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -30,6 +30,8 @@ #include #endif +#include + #define ASYNCIO_CONCURRENCY 64 std::mutex lock; @@ -196,15 +198,17 @@ static uint32_t compile_count = 0; static std::mutex compile_count_mutex; static std::condition_variable compile_count_cond; -void string_to_spv_func(const std::string& _name, const std::string& in_fname, const std::map& defines, bool fp16 = true) { - std::string name = _name + (fp16 ? 
"" : "_fp32"); +void string_to_spv_func(const std::string& _name, const std::string& in_fname, const std::map& defines, bool fp16 = true, bool coopmat2 = false, bool f16acc = false) { + std::string name = _name + (f16acc ? "_f16acc" : "") + (coopmat2 ? "_cm2" : (fp16 ? "" : "_fp32")); std::string out_fname = join_paths(output_dir, name + ".spv"); std::string in_path = join_paths(input_dir, in_fname); + std::string target_env = (name.find("_cm2") != std::string::npos) ? "--target-env=vulkan1.3" : "--target-env=vulkan1.2"; + #ifdef _WIN32 - std::vector cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""}; + std::vector cmd = {GLSLC, "-fshader-stage=compute", target_env, "-O", "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""}; #else - std::vector cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname}; + std::vector cmd = {GLSLC, "-fshader-stage=compute", target_env, "-O", in_path, "-o", out_fname}; #endif #ifdef GGML_VULKAN_SHADER_DEBUG_INFO @@ -254,7 +258,7 @@ std::map merge_maps(const std::map> compiles; -void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map& defines, bool fp16 = true) { +void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map& defines, bool fp16 = true, bool coopmat2 = false, bool f16acc = false) { { // wait until fewer than N compiles are in progress. // 16 is an arbitrary limit, the goal is to avoid "failed to create pipe" errors. @@ -265,15 +269,15 @@ void string_to_spv(const std::string& _name, const std::string& in_fname, const } compile_count++; } - compiles.push_back(std::async(string_to_spv_func, _name, in_fname, defines, fp16)); + compiles.push_back(std::async(string_to_spv_func, _name, in_fname, defines, fp16, coopmat2, f16acc)); } -void matmul_shaders(bool fp16, bool matmul_id) { - std::string load_vec = fp16 ? "8" : "4"; - std::string aligned_b_type_f32 = fp16 ? "mat2x4" : "vec4"; - std::string aligned_b_type_f16 = fp16 ? "f16mat2x4" : "f16vec4"; +void matmul_shaders(bool fp16, bool matmul_id, bool coopmat2, bool f16acc) { + std::string load_vec = coopmat2 ? "1" : fp16 ? "8" : "4"; + std::string aligned_b_type_f32 = coopmat2 ? "float" : fp16 ? "mat2x4" : "vec4"; + std::string aligned_b_type_f16 = coopmat2 ? "float16_t" : fp16 ? "f16mat2x4" : "f16vec4"; - std::map base_dict = {{"FLOAT_TYPE", fp16 ? "float16_t" : "float"}}; + std::map base_dict = {{"FLOAT_TYPE", (coopmat2 || fp16) ? "float16_t" : "float"}}; std::string shader_name = "matmul"; if (matmul_id) { @@ -285,21 +289,31 @@ void matmul_shaders(bool fp16, bool matmul_id) { base_dict["FLOAT16"] = "1"; } - // Shaders with f16 B_TYPE - string_to_spv(shader_name + "_f32_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16); - string_to_spv(shader_name + "_f32_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16); + base_dict["ACC_TYPE"] = f16acc ? 
"float16_t" : "float"; - string_to_spv(shader_name + "_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16); - string_to_spv(shader_name + "_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16); + std::string source_name = coopmat2 ? "mul_mm_cm2.comp" : "mul_mm.comp"; + + // Shaders with f16 B_TYPE + string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, }), fp16, coopmat2, f16acc); + string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc); + + string_to_spv(shader_name + "_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc); + string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat2, f16acc); for (const auto& tname : type_names) { std::string data_a_key = "DATA_A_" + to_uppercase(tname); // For unaligned, load one at a time for f32/f16, or two at a time for quants - std::string load_vec_a_unaligned = (tname == "f32" || tname == "f16") ? "1" : "2"; + std::string load_vec_a_unaligned = (coopmat2 || tname == "f32" || tname == "f16") ? "1" : "2"; // For aligned matmul loads - std::string load_vec_a = (tname == "f32" || tname == "f16") ? load_vec : "2"; - string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16); - string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16); + std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16") ? 
load_vec : "2"; + + string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc); + + if (tname != "f16" && tname != "f32") { + string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc); + } } } @@ -307,11 +321,50 @@ void process_shaders() { std::cout << "ggml_vulkan: Generating and compiling shaders to SPIR-V" << std::endl; std::map base_dict = {{"FLOAT_TYPE", "float"}}; + // matmul for (const auto& fp16 : {false, true}) { - matmul_shaders(fp16, false); - matmul_shaders(fp16, true); + for (const auto& matmul_id : {false, true}) { + for (const auto& coopmat2 : {false, true}) { + for (const auto& f16acc : {false, true}) { +#if !defined(VK_NV_cooperative_matrix2) + if (coopmat2) { + continue; + } +#endif + if (coopmat2 && !fp16) { + continue; + } + if (!coopmat2 && f16acc) { + continue; + } + matmul_shaders(fp16, matmul_id, coopmat2, f16acc); + } + } + } } +#if defined(VK_NV_cooperative_matrix2) + // flash attention + for (const auto& f16acc : {false, true}) { + std::string acctype = f16acc ? 
"float16_t" : "float"; + + for (const auto& tname : type_names) { + if (tname == "f32") { + continue; + } + + if (tname == "f16") { + string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp", + merge_maps(base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}}), true, true, f16acc); + } else { + std::string data_a_key = "DATA_A_" + to_uppercase(tname); + string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp", + merge_maps(base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}, {"DEQUANTFUNC", "dequantFunc"+to_uppercase(tname) }, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }}), true, true, f16acc); + } + } + } +#endif + for (const auto& tname : type_names) { // mul mat vec std::string data_a_key = "DATA_A_" + to_uppercase(tname); From 7736837d62efed1dbebfe579472fca041eda12d6 Mon Sep 17 00:00:00 2001 From: Plamen Minev Date: Thu, 5 Dec 2024 23:36:41 +0200 Subject: [PATCH 14/99] fix(server) : not show alert when DONE is received (#10674) --- examples/server/public_simplechat/simplechat.js | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/server/public_simplechat/simplechat.js b/examples/server/public_simplechat/simplechat.js index 8e0df3b61..2fcd24a86 100644 --- a/examples/server/public_simplechat/simplechat.js +++ b/examples/server/public_simplechat/simplechat.js @@ -407,6 +407,9 @@ class SimpleChat { if (curLine.startsWith("data:")) { curLine = curLine.substring(5); } + if (curLine.trim() === "[DONE]") { + break; + } let curJson = JSON.parse(curLine); console.debug("DBUG:SC:PART:Json:", curJson); this.append_response(this.response_extract_stream(curJson, apiEP)); From 6c5bc0625fae6909cb40def15bc4bb45db6f7f4d Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 6 Dec 2024 11:14:32 +0100 Subject: [PATCH 15/99] server : (refactoring) do not rely on JSON internally (#10643) * server : (refactoring) reduce usage of json internally * move all response types to struct * wip [no ci] * many fixes * add virtual function * fix index * minor style fix * add std::move * refactor handle_completions_generic * add virtual functions * remove server.hpp * clarify server_sent_event RFC specs * apply review comments * fix model_alias and completion_probabilities * small clean up * remove virtual for to_json_oai_compat() * naming oai_compat --> oaicompat * fix unwanted recursive call * update docs --- common/common.h | 2 +- examples/server/README.md | 8 +- examples/server/server.cpp | 1341 +++++++++++------ examples/server/tests/README.md | 6 + examples/server/tests/tests.sh | 4 + .../server/tests/unit/test_chat_completion.py | 33 +- examples/server/tests/unit/test_completion.py | 39 + examples/server/utils.hpp | 249 +-- 8 files changed, 985 insertions(+), 697 deletions(-) diff --git a/common/common.h b/common/common.h index 0373fd3ea..95d20401d 100644 --- a/common/common.h +++ b/common/common.h @@ -215,7 +215,7 @@ struct common_params { struct common_params_speculative speculative; std::string model = ""; // model path // NOLINT - std::string model_alias = "unknown"; // model alias // NOLINT + std::string model_alias = ""; // model alias // NOLINT std::string model_url = ""; // model url to download // NOLINT std::string hf_token = ""; // HF token // NOLINT std::string hf_repo = ""; // HF repo // NOLINT diff --git a/examples/server/README.md b/examples/server/README.md index b2dd7b65a..8dbed2626 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -473,9 +473,11 @@ Notice that each `probs` 
is an array of length `n_probs`. - `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.). - `model`: The path to the model loaded with `-m` - `prompt`: The provided `prompt` -- `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token -- `stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered -- `stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided +- `stop_type`: Indicating whether the completion has stopped. Possible values are: + - `none`: Generating (not stopped) + - `eos`: Stopped because it encountered the EOS token + - `limit`: Stopped because `n_predict` tokens were generated before stop words or EOS was encountered + - `word`: Stopped due to encountering a stopping word from `stop` JSON array provided - `stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word) - `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second` - `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 31dfd6240..809fafa18 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -33,8 +33,10 @@ using json = nlohmann::ordered_json; enum stop_type { - STOP_TYPE_FULL, - STOP_TYPE_PARTIAL, + STOP_TYPE_NONE, + STOP_TYPE_EOS, + STOP_TYPE_WORD, + STOP_TYPE_LIMIT, }; // state diagram: https://github.com/ggerganov/llama.cpp/pull/9283 @@ -69,12 +71,25 @@ enum server_task_inf_type { SERVER_TASK_INF_TYPE_INFILL, }; +// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 +enum error_type { + ERROR_TYPE_INVALID_REQUEST, + ERROR_TYPE_AUTHENTICATION, + ERROR_TYPE_SERVER, + ERROR_TYPE_NOT_FOUND, + ERROR_TYPE_PERMISSION, + ERROR_TYPE_UNAVAILABLE, // custom error + ERROR_TYPE_NOT_SUPPORTED, // custom error +}; + struct server_task { int id = -1; // to be filled by server_queue int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL llama_tokens prompt_tokens; server_task_type type; + + // TODO @ngxson : we should get rid of json type here json data; server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION; @@ -89,15 +104,6 @@ struct server_task { } }; -struct server_task_result { - int id = -1; - - json data; - - bool stop; - bool error; -}; - struct slot_params { bool stream = true; bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt @@ -111,9 +117,603 @@ struct slot_params { int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit std::vector antiprompt; + bool timings_per_token = false; struct common_params_sampling sampling; struct common_params_speculative speculative; + + // params only used in to_json() + int32_t n_ctx; + uint32_t seed_cur; + bool can_speculative; + + // OAI-compat fields + bool verbose = false; + bool oaicompat = false; + bool oaicompat_chat = true; + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + + json to_json() { + std::vector samplers; + samplers.reserve(sampling.samplers.size()); + for (const auto & sampler : sampling.samplers) { + 
samplers.emplace_back(common_sampler_type_to_str(sampler)); + } + + return json { + {"n_ctx", n_ctx}, + {"n_predict", n_predict}, // Server configured n_predict + {"temperature", sampling.temp}, + {"dynatemp_range", sampling.dynatemp_range}, + {"dynatemp_exponent", sampling.dynatemp_exponent}, + {"top_k", sampling.top_k}, + {"top_p", sampling.top_p}, + {"min_p", sampling.min_p}, + {"xtc_probability", sampling.xtc_probability}, + {"xtc_threshold", sampling.xtc_threshold}, + {"typical_p", sampling.typ_p}, + {"repeat_last_n", sampling.penalty_last_n}, + {"repeat_penalty", sampling.penalty_repeat}, + {"presence_penalty", sampling.penalty_present}, + {"frequency_penalty", sampling.penalty_freq}, + {"dry_multiplier", sampling.dry_multiplier}, + {"dry_base", sampling.dry_base}, + {"dry_allowed_length", sampling.dry_allowed_length}, + {"dry_penalty_last_n", sampling.dry_penalty_last_n}, + {"dry_sequence_breakers", sampling.dry_sequence_breakers}, + {"mirostat", sampling.mirostat}, + {"mirostat_tau", sampling.mirostat_tau}, + {"mirostat_eta", sampling.mirostat_eta}, + {"penalize_nl", sampling.penalize_nl}, + {"stop", antiprompt}, + {"max_tokens", n_predict}, // User configured n_predict + {"n_keep", n_keep}, + {"n_discard", n_discard}, + {"ignore_eos", sampling.ignore_eos}, + {"stream", stream}, + //{"logit_bias", sampling.logit_bias}, + {"n_probs", sampling.n_probs}, + {"min_keep", sampling.min_keep}, + {"grammar", sampling.grammar}, + {"samplers", samplers}, + {"speculative", can_speculative}, + {"speculative.n_max", speculative.n_max}, + {"speculative.n_min", speculative.n_min}, + {"speculative.p_min", speculative.p_min}, + {"timings_per_token", timings_per_token}, + }; + } +}; + +struct result_timings { + int32_t prompt_n = -1; + double prompt_ms; + double prompt_per_token_ms; + double prompt_per_second; + + int32_t predicted_n = -1; + double predicted_ms; + double predicted_per_token_ms; + double predicted_per_second; + + json to_json() { + return { + {"prompt_n", prompt_n}, + {"prompt_ms", prompt_ms}, + {"prompt_per_token_ms", prompt_per_token_ms}, + {"prompt_per_second", prompt_per_second}, + + {"predicted_n", predicted_n}, + {"predicted_ms", predicted_ms}, + {"predicted_per_token_ms", predicted_per_token_ms}, + {"predicted_per_second", predicted_per_second}, + }; + } +}; + +struct server_task_result { + int id = -1; + int id_slot = -1; + virtual bool is_error() { + // only used by server_task_result_error + return false; + } + virtual bool is_stop() { + // only used by server_task_result_cmpl_partial + return false; + } + virtual int get_index() { + return -1; + } + virtual json to_json() = 0; + virtual ~server_task_result() = default; +}; + +// using shared_ptr for polymorphism of server_task_result +using server_task_result_ptr = std::unique_ptr; + +inline std::string stop_type_to_str(stop_type type) { + switch (type) { + case STOP_TYPE_EOS: return "eos"; + case STOP_TYPE_WORD: return "word"; + case STOP_TYPE_LIMIT: return "limit"; + default: return "none"; + } +} + +struct completion_token_output { + llama_token tok; + std::string text_to_send; + struct token_prob { + llama_token tok; + std::string tok_str; + float prob; + }; + std::vector probs; + + json to_json() const { + json probs_for_token = json::array(); + for (const auto & p : probs) { + probs_for_token.push_back(json { + {"tok_str", p.tok_str}, + {"prob", p.prob}, + }); + } + return probs_for_token; + } + + static json probs_vector_to_json(const std::vector & probs) { + json out = json::array(); + for (const auto & prob : 
probs) { + const std::string tok_str = prob.text_to_send; + out.push_back(json { + {"content", tok_str}, + {"probs", prob.to_json()}, + }); + } + return out; + } +}; + +struct server_task_result_cmpl_final : server_task_result { + int index = 0; + std::string content; + bool stream; + result_timings timings; + std::string prompt; + + bool truncated; + int32_t n_decoded; + int32_t n_prompt_tokens; + int32_t n_tokens_cached; + int32_t has_new_line; + std::string stopping_word; + stop_type stop = STOP_TYPE_NONE; + + std::vector probs_output; + + slot_params generation_params; + + // OAI-compat fields + bool verbose = false; + bool oaicompat = false; + bool oaicompat_chat = true; // TODO: support oaicompat for non-chat + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + + virtual int get_index() override { + return index; + } + + virtual json to_json() override { + return oaicompat ? to_json_oaicompat_chat() : to_json_non_oaicompat(); + } + + json to_json_non_oaicompat() { + json res = json { + {"index", index}, + {"content", content}, + {"id_slot", id_slot}, + {"stop", true}, + {"model", oaicompat_model}, + {"tokens_predicted", n_decoded}, + {"tokens_evaluated", n_prompt_tokens}, + {"generation_settings", generation_params.to_json()}, + {"prompt", prompt}, + {"has_new_line", has_new_line}, + {"truncated", truncated}, + {"stop_type", stop_type_to_str(stop)}, + {"stopping_word", stopping_word}, + {"tokens_cached", n_tokens_cached}, + {"timings", timings.to_json()}, + }; + if (!probs_output.empty()) { + res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output); + } + return res; + } + + json to_json_oaicompat_chat() { + std::string finish_reason = "length"; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + finish_reason = "stop"; + } + + json choices = json::array({json{ + {"finish_reason", finish_reason}, + {"index", 0}, + {"message", json{ + {"content", content}, + {"role", "assistant"} + } + }}}); + + std::time_t t = std::time(0); + + json res = json { + {"choices", choices}, + {"created", t}, + {"model", oaicompat_model}, + {"object", "chat.completion"}, + {"usage", json { + {"completion_tokens", n_decoded}, + {"prompt_tokens", n_prompt_tokens}, + {"total_tokens", n_decoded + n_prompt_tokens} + }}, + {"id", oaicompat_cmpl_id} + }; + + // extra fields for debugging purposes + if (verbose) { + res["__verbose"] = to_json_non_oaicompat(); + } + if (timings.prompt_n >= 0) { + res.push_back({"timings", timings.to_json()}); + } + + return res; + } +}; + +struct server_task_result_cmpl_partial : server_task_result { + int index = 0; + std::string content; + + bool truncated; + int32_t n_decoded; + int32_t n_prompt_tokens; + + stop_type stop = STOP_TYPE_NONE; + + std::vector probs_output; + result_timings timings; + + // OAI-compat fields + bool verbose = false; + bool oaicompat = false; + bool oaicompat_chat = true; // TODO: support oaicompat for non-chat + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + + virtual int get_index() override { + return index; + } + + virtual bool is_stop() override { + return stop != STOP_TYPE_NONE; + } + + virtual json to_json() override { + if (oaicompat) { + return to_json_oaicompat(); + } + bool is_stop = stop != STOP_TYPE_NONE; + // non-OAI-compat JSON + json res = json { + {"index", index}, + {"content", content}, + {"stop_type", stop_type_to_str(stop)}, + {"stop", is_stop}, + {"id_slot", id_slot}, + {"tokens_predicted", n_decoded}, + {"tokens_evaluated", n_prompt_tokens}, + }; + // 
populate the timings object when needed (usually for the last response or with timings_per_token enabled) + if (timings.prompt_n > 0) { + res.push_back({"timings", timings.to_json()}); + } + if (!probs_output.empty()) { + res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output); + } + if (is_stop) { + res.push_back({"truncated", truncated}); + } + return res; + } + + json to_json_oaicompat() { + bool first = n_decoded == 0; + + std::string finish_reason; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + finish_reason = "stop"; + } else if (stop == STOP_TYPE_LIMIT) { + finish_reason = "length"; + } + + std::time_t t = std::time(0); + + json choices; + + if (!finish_reason.empty()) { + choices = json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}}}); + } else { + if (first) { + if (content.empty()) { + choices = json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{{"role", "assistant"}}}}}); + } else { + // We have to send this as two updates to conform to openai behavior + json initial_ret = json{{"choices", json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"role", "assistant"} + }}}})}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"}}; + + json second_ret = json{ + {"choices", json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"content", content}}} + }})}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"}}; + + return std::vector({initial_ret, second_ret}); + } + } else { + // Some idiosyncrasy in task processing logic makes several trailing calls + // with empty content, we ignore these at the calee site. 
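Both the partial and the final OAI-compatible responses map the internal stop_type onto OpenAI's finish_reason in the same way; pulled out into a standalone helper, the mapping would look roughly like this (a sketch for illustration only, the server keeps the logic inline):

    #include <string>

    enum stop_type { STOP_TYPE_NONE, STOP_TYPE_EOS, STOP_TYPE_WORD, STOP_TYPE_LIMIT };

    // Stop words and EOS become "stop", running into the n_predict or time budget
    // becomes "length", and an empty string means generation is still in progress
    // (only meaningful while streaming deltas).
    static std::string to_finish_reason(stop_type stop) {
        switch (stop) {
            case STOP_TYPE_WORD:
            case STOP_TYPE_EOS:   return "stop";
            case STOP_TYPE_LIMIT: return "length";
            default:              return "";
        }
    }
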
+ if (content.empty()) { + return std::vector({json::object()}); + } + + choices = json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", + json{ + {"content", content}, + }}, + }}); + } + } + + json ret = json { + {"choices", choices}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"} + }; + + if (timings.prompt_n >= 0) { + ret.push_back({"timings", timings.to_json()}); + } + + if (!finish_reason.empty()) { + ret.push_back({"usage", json { + {"completion_tokens", n_decoded}, + {"prompt_tokens", n_prompt_tokens}, + {"total_tokens", n_decoded + n_prompt_tokens}, + }}); + } + + return std::vector({ret}); + } +}; + +struct server_task_result_embd : server_task_result { + int index = 0; + std::vector embedding; + + virtual int get_index() override { + return index; + } + + virtual json to_json() override { + return json { + {"index", index}, + {"embedding", embedding}, + }; + } +}; + +struct server_task_result_rerank : server_task_result { + int index = 0; + float score = -1e6; + + virtual int get_index() override { + return index; + } + + virtual json to_json() override { + return json { + {"index", index}, + {"score", score}, + }; + } +}; + +// this function maybe used outside of server_task_result_error +static json format_error_response(const std::string & message, const enum error_type type) { + std::string type_str; + int code = 500; + switch (type) { + case ERROR_TYPE_INVALID_REQUEST: + type_str = "invalid_request_error"; + code = 400; + break; + case ERROR_TYPE_AUTHENTICATION: + type_str = "authentication_error"; + code = 401; + break; + case ERROR_TYPE_NOT_FOUND: + type_str = "not_found_error"; + code = 404; + break; + case ERROR_TYPE_SERVER: + type_str = "server_error"; + code = 500; + break; + case ERROR_TYPE_PERMISSION: + type_str = "permission_error"; + code = 403; + break; + case ERROR_TYPE_NOT_SUPPORTED: + type_str = "not_supported_error"; + code = 501; + break; + case ERROR_TYPE_UNAVAILABLE: + type_str = "unavailable_error"; + code = 503; + break; + } + return json { + {"code", code}, + {"message", message}, + {"type", type_str}, + }; +} + +struct server_task_result_error : server_task_result { + int index = 0; + error_type err_type = ERROR_TYPE_SERVER; + std::string err_msg; + + virtual bool is_error() override { + return true; + } + + virtual json to_json() override { + return format_error_response(err_msg, err_type); + } +}; + +struct server_task_result_metrics : server_task_result { + int n_idle_slots; + int n_processing_slots; + int n_tasks_deferred; + int64_t t_start; + + int32_t kv_cache_tokens_count; + int32_t kv_cache_used_cells; + + // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields + uint64_t n_prompt_tokens_processed_total = 0; + uint64_t t_prompt_processing_total = 0; + uint64_t n_tokens_predicted_total = 0; + uint64_t t_tokens_generation_total = 0; + + uint64_t n_prompt_tokens_processed = 0; + uint64_t t_prompt_processing = 0; + + uint64_t n_tokens_predicted = 0; + uint64_t t_tokens_generation = 0; + + uint64_t n_decode_total = 0; + uint64_t n_busy_slots_total = 0; + + // TODO: get rid of this json object and use to_json() instead + json slots_data = json::array(); + + virtual json to_json() override { + return json { + { "idle", n_idle_slots }, + { "processing", n_processing_slots }, + { "deferred", n_tasks_deferred }, + { "t_start", t_start }, + + { "n_prompt_tokens_processed_total", n_prompt_tokens_processed_total }, + { 
"t_tokens_generation_total", t_tokens_generation_total }, + { "n_tokens_predicted_total", n_tokens_predicted_total }, + { "t_prompt_processing_total", t_prompt_processing_total }, + + { "n_prompt_tokens_processed", n_prompt_tokens_processed }, + { "t_prompt_processing", t_prompt_processing }, + { "n_tokens_predicted", n_tokens_predicted }, + { "t_tokens_generation", t_tokens_generation }, + + { "n_decode_total", n_decode_total }, + { "n_busy_slots_total", n_busy_slots_total }, + + { "kv_cache_tokens_count", kv_cache_tokens_count }, + { "kv_cache_used_cells", kv_cache_used_cells }, + + { "slots", slots_data }, + }; + } +}; + +struct server_task_result_slot_save_load : server_task_result { + std::string filename; + bool is_save; // true = save, false = load + + size_t n_tokens; + size_t n_bytes; + double t_ms; + + virtual json to_json() override { + if (is_save) { + return json { + { "id_slot", id_slot }, + { "filename", filename }, + { "n_saved", n_tokens }, + { "n_written", n_bytes }, + { "timings", { + { "save_ms", t_ms } + }}, + }; + } else { + return json { + { "id_slot", id_slot }, + { "filename", filename }, + { "n_restored", n_tokens }, + { "n_read", n_bytes }, + { "timings", { + { "restore_ms", t_ms } + }}, + }; + } + } +}; + +struct server_task_result_slot_erase : server_task_result { + size_t n_erased; + + virtual json to_json() override { + return json { + { "id_slot", id_slot }, + { "n_erased", n_erased }, + }; + } +}; + +struct server_task_result_apply_lora : server_task_result { + virtual json to_json() override { + return json {{ "success", true }}; + } }; struct server_slot { @@ -162,15 +762,8 @@ struct server_slot { bool has_next_token = true; bool has_new_line = false; bool truncated = false; - bool stopped_eos = false; - bool stopped_word = false; - bool stopped_limit = false; + stop_type stop; - bool timings_per_token = false; - - bool oaicompat = false; - - std::string oaicompat_model; std::string stopping_word; // sampling @@ -200,9 +793,7 @@ struct server_slot { generated_text = ""; has_new_line = false; truncated = false; - stopped_eos = false; - stopped_word = false; - stopped_limit = false; + stop = STOP_TYPE_NONE; stopping_word = ""; n_past = 0; n_sent_text = 0; @@ -255,38 +846,40 @@ struct server_slot { } } - json get_formated_timings() const { - return json { - {"prompt_n", n_prompt_tokens_processed}, - {"prompt_ms", t_prompt_processing}, - {"prompt_per_token_ms", t_prompt_processing / n_prompt_tokens_processed}, - {"prompt_per_second", 1e3 / t_prompt_processing * n_prompt_tokens_processed}, + result_timings get_timings() const { + result_timings timings; + timings.prompt_n = n_prompt_tokens_processed; + timings.prompt_ms = t_prompt_processing; + timings.prompt_per_token_ms = t_prompt_processing / n_prompt_tokens_processed; + timings.prompt_per_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; - {"predicted_n", n_decoded}, - {"predicted_ms", t_token_generation}, - {"predicted_per_token_ms", t_token_generation / n_decoded}, - {"predicted_per_second", 1e3 / t_token_generation * n_decoded}, - }; + timings.predicted_n = n_decoded; + timings.predicted_ms = t_token_generation; + timings.predicted_per_token_ms = t_token_generation / n_decoded; + timings.predicted_per_second = 1e3 / t_token_generation * n_decoded; + + return timings; } - size_t find_stopping_strings(const std::string & text, const size_t last_token_size, const stop_type type) { + size_t find_stopping_strings(const std::string & text, const size_t last_token_size, bool is_full_stop) { 
size_t stop_pos = std::string::npos; for (const std::string & word : params.antiprompt) { size_t pos; - if (type == STOP_TYPE_FULL) { + if (is_full_stop) { const size_t tmp = word.size() + last_token_size; const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; pos = text.find(word, from_pos); } else { + // otherwise, partial stop pos = find_partial_stop_string(word, text); } if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) { - if (type == STOP_TYPE_FULL) { - stopped_word = true; + if (is_full_stop) { + stop = STOP_TYPE_WORD; stopping_word = word; has_next_token = false; } @@ -511,8 +1104,8 @@ struct server_response { // for keeping track of all tasks waiting for the result std::unordered_set waiting_task_ids; - // the main result queue - std::vector queue_results; + // the main result queue (using ptr for polymorphism) + std::vector queue_results; std::mutex mutex_results; std::condition_variable condition_results; @@ -552,7 +1145,7 @@ struct server_response { } // This function blocks the thread until there is a response for one of the id_tasks - server_task_result recv(const std::unordered_set & id_tasks) { + server_task_result_ptr recv(const std::unordered_set & id_tasks) { while (true) { std::unique_lock lock(mutex_results); condition_results.wait(lock, [&]{ @@ -560,8 +1153,8 @@ struct server_response { }); for (int i = 0; i < (int) queue_results.size(); i++) { - if (id_tasks.find(queue_results[i].id) != id_tasks.end()) { - server_task_result res = queue_results[i]; + if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) { + server_task_result_ptr res = std::move(queue_results[i]); queue_results.erase(queue_results.begin() + i); return res; } @@ -572,21 +1165,21 @@ struct server_response { } // single-task version of recv() - server_task_result recv(int id_task) { + server_task_result_ptr recv(int id_task) { std::unordered_set id_tasks = {id_task}; return recv(id_tasks); } // Send a new result to a waiting id_task - void send(server_task_result & result) { - SRV_DBG("sending result for task id = %d\n", result.id); + void send(server_task_result_ptr && result) { + SRV_DBG("sending result for task id = %d\n", result->id); std::unique_lock lock(mutex_results); for (const auto & id_task : waiting_task_ids) { - if (result.id == id_task) { - SRV_DBG("task id = %d moved to result queue\n", result.id); + if (result->id == id_task) { + SRV_DBG("task id = %d pushed to result queue\n", result->id); - queue_results.push_back(std::move(result)); + queue_results.emplace_back(std::move(result)); condition_results.notify_all(); return; } @@ -777,7 +1370,7 @@ struct server_context { slots.push_back(slot); } - default_generation_settings_for_props = get_formated_generation(slots.front()); + default_generation_settings_for_props = slots[0].params.to_json(); default_generation_settings_for_props["seed"] = -1; // the update_slots() logic will always submit a maximum of n_batch or n_parallel tokens @@ -873,14 +1466,19 @@ struct server_context { const auto & data = task.data; if (data.count("__oaicompat") != 0) { - slot.oaicompat = true; - slot.oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + std::string model_name = params_base.model_alias.empty() ? 
DEFAULT_OAICOMPAT_MODEL : params_base.model_alias; + slot.params.oaicompat = true; + slot.params.oaicompat_chat = json_value(data, "__oaicompat_chat", false); + slot.params.oaicompat_model = json_value(data, "model", model_name); + slot.params.oaicompat_cmpl_id = json_value(data, "completion_id", std::string()); } else { - slot.oaicompat = false; - slot.oaicompat_model = ""; + slot.params.oaicompat = false; } - slot.timings_per_token = json_value(data, "timings_per_token", false); + + // enabling this will output extra debug information in the HTTP responses from the server + slot.params.verbose = params_base.verbosity > 9; + slot.params.timings_per_token = json_value(data, "timings_per_token", false); slot.params.stream = json_value(data, "stream", false); slot.params.cache_prompt = json_value(data, "cache_prompt", true); @@ -1110,14 +1708,14 @@ struct server_context { const std::string str_test = slot.generated_text.substr(pos); bool send_text = true; - size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_FULL); + size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), true); if (stop_pos != std::string::npos) { slot.generated_text.erase( slot.generated_text.begin() + pos + stop_pos, slot.generated_text.end()); pos = std::min(slot.n_sent_text, slot.generated_text.size()); } else if (slot.has_next_token) { - stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_PARTIAL); + stop_pos = slot.find_stopping_strings(str_test, token_str.size(), false); send_text = stop_pos == std::string::npos; } @@ -1141,7 +1739,7 @@ struct server_context { // check the limits if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) { - slot.stopped_limit = true; + slot.stop = STOP_TYPE_LIMIT; slot.has_next_token = false; SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict); @@ -1150,7 +1748,7 @@ struct server_context { if (slot.has_new_line) { // if we have already seen a new line, we stop after a certain time limit if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) { - slot.stopped_limit = true; + slot.stop = STOP_TYPE_LIMIT; slot.has_next_token = false; SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms); @@ -1170,7 +1768,7 @@ struct server_context { } if (pos < slot.generated_text.size() && n_indent < slot.params.n_indent) { - slot.stopped_limit = true; + slot.stop = STOP_TYPE_LIMIT; slot.has_next_token = false; // cut the last line @@ -1199,7 +1797,7 @@ struct server_context { // if context shift is disabled, we stop when it reaches the context limit if (slot.n_past >= slot.n_ctx) { slot.truncated = true; - slot.stopped_limit = true; + slot.stop = STOP_TYPE_LIMIT; slot.has_next_token = false; SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n", @@ -1207,7 +1805,7 @@ struct server_context { } if (llama_token_is_eog(model, result.tok)) { - slot.stopped_eos = true; + slot.stop = STOP_TYPE_EOS; slot.has_next_token = false; SLT_DBG(slot, "%s", "stopped by EOS\n"); @@ -1217,7 +1815,7 @@ struct server_context { if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { slot.truncated = true; - slot.stopped_limit = true; + slot.stop = STOP_TYPE_LIMIT; slot.has_next_token = false; // stop 
prediction SLT_WRN(slot, @@ -1231,60 +1829,6 @@ struct server_context { return slot.has_next_token; // continue } - json get_formated_generation(const server_slot & slot) const { - std::vector samplers; - samplers.reserve(slot.params.sampling.samplers.size()); - for (const auto & sampler : slot.params.sampling.samplers) { - samplers.emplace_back(common_sampler_type_to_str(sampler)); - } - - return json { - {"n_ctx", slot.n_ctx}, - {"n_predict", slot.n_predict}, // Server configured n_predict - {"model", params_base.model_alias}, - {"seed", slot.params.sampling.seed}, - {"seed_cur", slot.smpl ? common_sampler_get_seed(slot.smpl) : 0}, - {"temperature", slot.params.sampling.temp}, - {"dynatemp_range", slot.params.sampling.dynatemp_range}, - {"dynatemp_exponent", slot.params.sampling.dynatemp_exponent}, - {"top_k", slot.params.sampling.top_k}, - {"top_p", slot.params.sampling.top_p}, - {"min_p", slot.params.sampling.min_p}, - {"xtc_probability", slot.params.sampling.xtc_probability}, - {"xtc_threshold", slot.params.sampling.xtc_threshold}, - {"typical_p", slot.params.sampling.typ_p}, - {"repeat_last_n", slot.params.sampling.penalty_last_n}, - {"repeat_penalty", slot.params.sampling.penalty_repeat}, - {"presence_penalty", slot.params.sampling.penalty_present}, - {"frequency_penalty", slot.params.sampling.penalty_freq}, - {"dry_multiplier", slot.params.sampling.dry_multiplier}, - {"dry_base", slot.params.sampling.dry_base}, - {"dry_allowed_length", slot.params.sampling.dry_allowed_length}, - {"dry_penalty_last_n", slot.params.sampling.dry_penalty_last_n}, - {"dry_sequence_breakers", slot.params.sampling.dry_sequence_breakers}, - {"mirostat", slot.params.sampling.mirostat}, - {"mirostat_tau", slot.params.sampling.mirostat_tau}, - {"mirostat_eta", slot.params.sampling.mirostat_eta}, - {"penalize_nl", slot.params.sampling.penalize_nl}, - {"stop", slot.params.antiprompt}, - {"max_tokens", slot.params.n_predict}, // User configured n_predict - {"n_keep", slot.params.n_keep}, - {"n_discard", slot.params.n_discard}, - {"ignore_eos", slot.params.sampling.ignore_eos}, - {"stream", slot.params.stream}, - //{"logit_bias", slot.params.sampling.logit_bias}, - {"n_probs", slot.params.sampling.n_probs}, - {"min_keep", slot.params.sampling.min_keep}, - {"grammar", slot.params.sampling.grammar}, - {"samplers", samplers}, - {"speculative", slot.can_speculate()}, - {"speculative.n_max", slot.params.speculative.n_max}, - {"speculative.n_min", slot.params.speculative.n_min}, - {"speculative.p_min", slot.params.speculative.p_min}, - {"timings_per_token", slot.timings_per_token}, - }; - } - void send_error(const server_task & task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { send_error(task.id, error, type); } @@ -1296,28 +1840,33 @@ struct server_context { void send_error(const int id_task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { SRV_ERR("task id = %d, error: %s\n", id_task, error.c_str()); - server_task_result res; - res.id = id_task; - res.stop = false; - res.error = true; - res.data = format_error_response(error, type); + auto res = std::make_unique(); + res->id = id_task; + res->err_type = type; + res->err_msg = error; - queue_results.send(res); + queue_results.send(std::move(res)); } void send_partial_response(server_slot & slot, completion_token_output tkn) { - server_task_result res; - res.id = slot.id_task; - res.error = false; - res.stop = false; - res.data = json { - {"content", tkn.text_to_send}, - {"stop", false}, - {"id_slot", 
slot.id}, - {"multimodal", false}, - {"index", slot.index}, - }; + auto res = std::make_unique(); + res->id = slot.id_task; + res->index = slot.index; + res->content = tkn.text_to_send; + res->truncated = slot.truncated; + res->n_decoded = slot.n_decoded; + res->n_prompt_tokens = slot.n_prompt_tokens; + + res->stop = slot.stop; + + res->verbose = slot.params.verbose; + res->oaicompat = slot.params.oaicompat; + res->oaicompat_chat = slot.params.oaicompat_chat; + res->oaicompat_model = slot.params.oaicompat_model; + res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id; + + // populate res.probs_output if (slot.params.sampling.n_probs > 0) { const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size()); @@ -1325,83 +1874,74 @@ struct server_context { std::vector probs_output; if (probs_pos < probs_stop_pos) { - probs_output = std::vector( + res->probs_output = std::vector( slot.generated_token_probs.begin() + probs_pos, slot.generated_token_probs.begin() + probs_stop_pos); } - slot.n_sent_token_probs = probs_stop_pos; - - res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs_output); } - if (slot.oaicompat) { - res.data["oaicompat_token_ctr"] = slot.n_decoded; - res.data["model"] = slot.oaicompat_model; + // populate timings if this is final response or timings_per_token is enabled + if (slot.stop != STOP_TYPE_NONE || slot.params.timings_per_token) { + res->timings = slot.get_timings(); } - if (slot.timings_per_token) { - res.data["timings"] = slot.get_formated_timings(); - } - - queue_results.send(res); + queue_results.send(std::move(res)); } - void send_final_response(const server_slot & slot) { - server_task_result res; - res.id = slot.id_task; - res.error = false; - res.stop = true; - res.data = json { - {"content", !slot.params.stream ? 
slot.generated_text : ""}, - {"id_slot", slot.id}, - {"stop", true}, - {"model", params_base.model_alias}, - {"tokens_predicted", slot.n_decoded}, - {"tokens_evaluated", slot.n_prompt_tokens}, - {"generation_settings", get_formated_generation(slot)}, - {"prompt", common_detokenize(ctx, slot.prompt_tokens)}, - {"has_new_line", slot.has_new_line}, - {"truncated", slot.truncated}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, - {"tokens_cached", slot.n_past}, - {"timings", slot.get_formated_timings()}, - {"index", slot.index}, - }; + void send_final_response(server_slot & slot) { + if (slot.params.stream) { + // if in stream mode, send the last partial response + return send_partial_response(slot, {0, "", {}}); + } + auto res = std::make_unique(); + res->id = slot.id_task; + res->id_slot = slot.id; + + res->index = slot.index; + res->content = slot.generated_text; + res->timings = slot.get_timings(); + res->prompt = common_detokenize(ctx, slot.prompt_tokens, true); + + res->truncated = slot.truncated; + res->n_decoded = slot.n_decoded; + res->n_prompt_tokens = slot.n_prompt_tokens; + res->n_tokens_cached = slot.n_past; + res->has_new_line = slot.has_new_line; + res->stopping_word = slot.stopping_word; + res->stop = slot.stop; + + res->verbose = slot.params.verbose; + res->oaicompat = slot.params.oaicompat; + res->oaicompat_chat = slot.params.oaicompat_chat; + res->oaicompat_model = slot.params.oaicompat_model; + res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id; + + // populate res.probs_output if (slot.params.sampling.n_probs > 0) { - std::vector probs; - if (!slot.params.stream && slot.stopped_word) { + if (!slot.params.stream && slot.stop == STOP_TYPE_WORD) { const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size()); - probs = std::vector( + res->probs_output = std::vector( slot.generated_token_probs.begin(), slot.generated_token_probs.end() - safe_offset); } else { - probs = std::vector( + res->probs_output = std::vector( slot.generated_token_probs.begin(), slot.generated_token_probs.end()); } - - res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs); } - if (slot.oaicompat) { - res.data["oaicompat_token_ctr"] = slot.n_decoded; - res.data["model"] = slot.oaicompat_model; - } + res->generation_params = slot.params; // copy the parameters - queue_results.send(res); + queue_results.send(std::move(res)); } void send_embedding(const server_slot & slot, const llama_batch & batch) { - server_task_result res; - res.id = slot.id_task; - res.error = false; - res.stop = true; + auto res = std::make_unique(); + res->id = slot.id_task; + res->index = slot.index; const int n_embd = llama_n_embd(model); @@ -1420,32 +1960,23 @@ struct server_context { if (embd == NULL) { SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); - res.data = json { - {"embedding", std::vector(n_embd, 0.0f)}, - {"index", slot.index}, - }; - + res->embedding = std::vector(n_embd, 0.0f); continue; } common_embd_normalize(embd, embd_res.data(), n_embd); - - res.data = json { - {"embedding", embd_res}, - {"index", slot.index}, - }; + res->embedding = embd_res; } SLT_DBG(slot, "%s", "sending embeddings\n"); - queue_results.send(res); + queue_results.send(std::move(res)); } void send_rerank(const server_slot & slot, const llama_batch & 
batch) { - server_task_result res; - res.id = slot.id_task; - res.error = false; - res.stop = true; + auto res = std::make_unique(); + res->id = slot.id_task; + res->index = slot.index; for (int i = 0; i < batch.n_tokens; ++i) { if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { @@ -1460,23 +1991,16 @@ struct server_context { if (embd == NULL) { SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); - res.data = json { - {"index", slot.index}, - {"score", -1e6}, - }; - + res->score = -1e6; continue; } - res.data = json { - {"index", slot.index}, - {"score", embd[0]}, - }; + res->score = embd[0]; } - SLT_DBG(slot, "sending rerank result, res = '%s'\n", res.data.dump().c_str()); + SLT_DBG(slot, "sending rerank result, res.score = %f\n", res->score); - queue_results.send(res); + queue_results.send(std::move(res)); } // @@ -1567,49 +2091,54 @@ struct server_context { } // receive the results from task(s) created by create_tasks_inference - void receive_cmpl_results( + void receive_multi_results( const std::unordered_set & id_tasks, - const std::function&)> & result_handler, + const std::function&)> & result_handler, const std::function & error_handler) { - // TODO: currently, there is no way to detect the client has cancelled the request - std::vector results(id_tasks.size()); + std::vector results(id_tasks.size()); for (size_t i = 0; i < id_tasks.size(); i++) { - server_task_result result = queue_results.recv(id_tasks); + server_task_result_ptr result = queue_results.recv(id_tasks); - if (result.error) { - error_handler(result.data); + if (result->is_error()) { + error_handler(result->to_json()); cancel_tasks(id_tasks); return; } - const size_t idx = result.data["index"]; + GGML_ASSERT( + dynamic_cast(result.get()) != nullptr + || dynamic_cast(result.get()) != nullptr + || dynamic_cast(result.get()) != nullptr + ); + const size_t idx = result->get_index(); GGML_ASSERT(idx < results.size() && "index out of range"); - - results[idx] = result; + results[idx] = std::move(result); } result_handler(results); } // receive the results from task(s) created by create_tasks_inference, in stream mode void receive_cmpl_results_stream( - const std::unordered_set & id_tasks, const - std::function & result_handler, const - std::function & error_handler) { + const std::unordered_set & id_tasks, + const std::function & result_handler, + const std::function & error_handler) { size_t n_finished = 0; while (true) { - server_task_result result = queue_results.recv(id_tasks); + server_task_result_ptr result = queue_results.recv(id_tasks); + + if (result->is_error()) { + error_handler(result->to_json()); + cancel_tasks(id_tasks); + return; + } + + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); if (!result_handler(result)) { cancel_tasks(id_tasks); break; } - if (result.error) { - error_handler(result.data); - cancel_tasks(id_tasks); - break; - } - - if (result.stop) { + if (result->is_stop()) { if (++n_finished == id_tasks.size()) { break; } @@ -1676,7 +2205,7 @@ struct server_context { int n_processing_slots = 0; for (server_slot & slot : slots) { - json slot_data = get_formated_generation(slot); + json slot_data = slot.params.to_json(); slot_data["id"] = slot.id; slot_data["id_task"] = slot.id_task; slot_data["is_processing"] = slot.is_processing(); @@ -1686,9 +2215,6 @@ struct server_context { {"has_new_line", slot.has_new_line}, {"n_remain", slot.n_remaining}, {"n_decoded", slot.n_decoded}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", 
slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, {"stopping_word", slot.stopping_word}, }; @@ -1702,39 +2228,33 @@ struct server_context { } SRV_DBG("n_idle_slots = %d, n_processing_slots = %d\n", n_idle_slots, n_processing_slots); - server_task_result res; - res.id = task.id; - res.stop = true; - res.error = false; - res.data = { - { "idle", n_idle_slots }, - { "processing", n_processing_slots }, - { "deferred", queue_tasks.queue_tasks_deferred.size() }, - { "t_start", metrics.t_start}, + auto res = std::make_unique(); + res->id = task.id; + res->n_idle_slots = n_idle_slots; + res->n_processing_slots = n_processing_slots; + res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); + res->t_start = metrics.t_start; - { "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total}, - { "t_tokens_generation_total", metrics.t_tokens_generation_total}, - { "n_tokens_predicted_total", metrics.n_tokens_predicted_total}, - { "t_prompt_processing_total", metrics.t_prompt_processing_total}, + res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx); + res->kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx); - { "n_prompt_tokens_processed", metrics.n_prompt_tokens_processed}, - { "t_prompt_processing", metrics.t_prompt_processing}, - { "n_tokens_predicted", metrics.n_tokens_predicted}, - { "t_tokens_generation", metrics.t_tokens_generation}, + res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; + res->t_prompt_processing_total = metrics.t_prompt_processing_total; + res->n_tokens_predicted_total = metrics.n_tokens_predicted_total; + res->t_tokens_generation_total = metrics.t_tokens_generation_total; - { "n_decode_total", metrics.n_decode_total}, - { "n_busy_slots_total", metrics.n_busy_slots_total}, + res->n_prompt_tokens_processed = metrics.n_prompt_tokens_processed; + res->t_prompt_processing = metrics.t_prompt_processing; + res->n_tokens_predicted = metrics.n_tokens_predicted; + res->t_tokens_generation = metrics.t_tokens_generation; - { "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)}, - { "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)}, - - { "slots", slots_data }, - }; + res->n_decode_total = metrics.n_decode_total; + res->n_busy_slots_total = metrics.n_busy_slots_total; if (json_value(task.data, "reset_bucket", false)) { metrics.reset_bucket(); } - queue_results.send(res); + queue_results.send(std::move(res)); } break; case SERVER_TASK_TYPE_SLOT_SAVE: { @@ -1762,20 +2282,15 @@ struct server_context { const int64_t t_end = ggml_time_us(); const double t_save_ms = (t_end - t_start) / 1000.0; - server_task_result result; - result.id = task.id; - result.stop = true; - result.error = false; - result.data = json { - { "id_slot", id_slot }, - { "filename", filename }, - { "n_saved", token_count }, // tokens saved - { "n_written", nwrite }, // bytes written - { "timings", { - { "save_ms", t_save_ms } - } } - }; - queue_results.send(result); + auto res = std::make_unique(); + res->id = task.id; + res->id_slot = id_slot; + res->filename = filename; + res->is_save = true; + res->n_tokens = token_count; + res->n_bytes = nwrite; + res->t_ms = t_save_ms; + queue_results.send(std::move(res)); } break; case SERVER_TASK_TYPE_SLOT_RESTORE: { @@ -1810,20 +2325,15 @@ struct server_context { const int64_t t_end = ggml_time_us(); const double t_restore_ms = (t_end - t_start) / 1000.0; - server_task_result result; - result.id = task.id; - result.stop = true; - result.error = false; - result.data = json { - { "id_slot", 
id_slot }, - { "filename", filename }, - { "n_restored", token_count }, // tokens restored - { "n_read", nread }, // bytes read - { "timings", { - { "restore_ms", t_restore_ms } - } } - }; - queue_results.send(result); + auto res = std::make_unique(); + res->id = task.id; + res->id_slot = id_slot; + res->filename = filename; + res->is_save = false; + res->n_tokens = token_count; + res->n_bytes = nread; + res->t_ms = t_restore_ms; + queue_results.send(std::move(res)); } break; case SERVER_TASK_TYPE_SLOT_ERASE: { @@ -1845,25 +2355,18 @@ struct server_context { llama_kv_cache_seq_rm(ctx, slot->id, -1, -1); slot->cache_tokens.clear(); - server_task_result result; - result.id = task.id; - result.stop = true; - result.error = false; - result.data = json { - { "id_slot", id_slot }, - { "n_erased", n_erased } - }; - queue_results.send(result); + auto res = std::make_unique(); + res->id = task.id; + res->id_slot = id_slot; + res->n_erased = n_erased; + queue_results.send(std::move(res)); } break; case SERVER_TASK_TYPE_SET_LORA: { common_lora_adapters_apply(ctx, loras); - server_task_result result; - result.id = task.id; - result.stop = true; - result.error = false; - result.data = json{{ "success", true }}; - queue_results.send(result); + auto res = std::make_unique(); + res->id = task.id; + queue_results.send(std::move(res)); } break; } } @@ -2298,9 +2801,11 @@ struct server_context { const auto * cur_p = common_sampler_get_candidates(slot.smpl); for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) { + auto tok_id = cur_p->data[i].id; result.probs.push_back({ - cur_p->data[i].id, - i >= cur_p->size ? 0.0f : cur_p->data[i].p, + tok_id, + tokens_to_output_formatted_string(ctx, tok_id), + i >= cur_p->size ? 0.0f : cur_p->data[i].p, }); } @@ -2452,17 +2957,9 @@ int main(int argc, char ** argv) { common_init(); - // enabling this will output extra debug information in the HTTP responses from the server - // see format_final_response_oaicompat() - const bool verbose = params.verbosity > 9; - // struct that contains llama context and inference server_context ctx_server; - if (params.model_alias == "unknown") { - params.model_alias = params.model; - } - llama_backend_init(); llama_numa_init(params.numa); @@ -2647,19 +3144,27 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.post(task, true); // high-priority task // get the result - server_task_result result = ctx_server.queue_results.recv(task.id); + server_task_result_ptr result = ctx_server.queue_results.recv(task.id); ctx_server.queue_results.remove_waiting_task_id(task.id); + if (result->is_error()) { + res_error(res, result->to_json()); + return; + } + + // TODO: get rid of this dynamic_cast + auto res_metrics = dynamic_cast(result.get()); + GGML_ASSERT(res_metrics != nullptr); + // optionally return "fail_on_no_slot" error - const int n_idle_slots = result.data.at("idle"); if (req.has_param("fail_on_no_slot")) { - if (n_idle_slots == 0) { + if (res_metrics->n_idle_slots == 0) { res_error(res, format_error_response("no slot available", ERROR_TYPE_UNAVAILABLE)); return; } } - res_ok(res, result.data.at("slots")); + res_ok(res, res_metrics->slots_data); }; const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) { @@ -2679,73 +3184,69 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.post(task, true); // high-priority task // get the result - server_task_result result = ctx_server.queue_results.recv(task.id); + server_task_result_ptr result = ctx_server.queue_results.recv(task.id); 
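Each of these HTTP handlers now follows the same shape: post a task, block on the matching result, check is_error(), and dynamic_cast to the concrete result struct when type-specific fields are needed. A self-contained C++ sketch of that pattern, using stand-in types rather than the server's actual classes:

    #include <iostream>
    #include <memory>
    #include <string>

    // Minimal stand-in for the polymorphic result hierarchy introduced by this
    // refactor: a virtual serializer on the base, unique_ptr ownership in the
    // queue, and a dynamic_cast at the call site for type-specific fields.
    struct result_base {
        int id = -1;
        virtual bool is_error() const { return false; }
        virtual std::string to_json() const = 0;
        virtual ~result_base() = default;
    };

    struct result_metrics : result_base {
        int n_idle_slots = 0;
        std::string to_json() const override {
            return "{\"idle\":" + std::to_string(n_idle_slots) + "}";
        }
    };

    int main() {
        std::unique_ptr<result_base> res = std::make_unique<result_metrics>();
        if (res->is_error()) {
            std::cout << "error: " << res->to_json() << "\n";   // error path
        } else if (auto * m = dynamic_cast<result_metrics *>(res.get())) {
            std::cout << m->to_json() << "\n";                  // type-specific handling
        }
        return 0;
    }

Holding results behind std::unique_ptr also keeps them move-only, which is why send() takes an rvalue reference and recv() transfers ownership with std::move elsewhere in this patch.
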
ctx_server.queue_results.remove_waiting_task_id(task.id); - json data = result.data; + if (result->is_error()) { + res_error(res, result->to_json()); + return; + } - const uint64_t n_prompt_tokens_processed = data.at("n_prompt_tokens_processed"); - const uint64_t t_prompt_processing = data.at("t_prompt_processing"); - - const uint64_t n_tokens_predicted = data.at("n_tokens_predicted"); - const uint64_t t_tokens_generation = data.at("t_tokens_generation"); - - const uint64_t n_decode_total = data.at("n_decode_total"); - const uint64_t n_busy_slots_total = data.at("n_busy_slots_total"); - - const int32_t kv_cache_used_cells = data.at("kv_cache_used_cells"); + // TODO: get rid of this dynamic_cast + auto res_metrics = dynamic_cast(result.get()); + GGML_ASSERT(res_metrics != nullptr); // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names json all_metrics_def = json { {"counter", {{ {"name", "prompt_tokens_total"}, {"help", "Number of prompt tokens processed."}, - {"value", (uint64_t) data.at("n_prompt_tokens_processed_total")} + {"value", (uint64_t) res_metrics->n_prompt_tokens_processed_total} }, { {"name", "prompt_seconds_total"}, {"help", "Prompt process time"}, - {"value", (uint64_t) data.at("t_prompt_processing_total") / 1.e3} + {"value", (uint64_t) res_metrics->t_prompt_processing_total / 1.e3} }, { {"name", "tokens_predicted_total"}, {"help", "Number of generation tokens processed."}, - {"value", (uint64_t) data.at("n_tokens_predicted_total")} + {"value", (uint64_t) res_metrics->n_tokens_predicted_total} }, { {"name", "tokens_predicted_seconds_total"}, {"help", "Predict process time"}, - {"value", (uint64_t) data.at("t_tokens_generation_total") / 1.e3} + {"value", (uint64_t) res_metrics->t_tokens_generation_total / 1.e3} }, { {"name", "n_decode_total"}, {"help", "Total number of llama_decode() calls"}, - {"value", n_decode_total} + {"value", res_metrics->n_decode_total} }, { {"name", "n_busy_slots_per_decode"}, {"help", "Average number of busy slots per llama_decode() call"}, - {"value", (float) n_busy_slots_total / (float) n_decode_total} + {"value", (float) res_metrics->n_busy_slots_total / (float) res_metrics->n_decode_total} }}}, {"gauge", {{ {"name", "prompt_tokens_seconds"}, {"help", "Average prompt throughput in tokens/s."}, - {"value", n_prompt_tokens_processed ? 1.e3 / t_prompt_processing * n_prompt_tokens_processed : 0.} + {"value", res_metrics->n_prompt_tokens_processed ? 1.e3 / res_metrics->t_prompt_processing * res_metrics->n_prompt_tokens_processed : 0.} },{ {"name", "predicted_tokens_seconds"}, {"help", "Average generation throughput in tokens/s."}, - {"value", n_tokens_predicted ? 1.e3 / t_tokens_generation * n_tokens_predicted : 0.} + {"value", res_metrics->n_tokens_predicted ? 1.e3 / res_metrics->t_tokens_generation * res_metrics->n_tokens_predicted : 0.} },{ {"name", "kv_cache_usage_ratio"}, {"help", "KV-cache usage. 1 means 100 percent usage."}, - {"value", 1. * kv_cache_used_cells / params.n_ctx} + {"value", 1. 
* res_metrics->kv_cache_used_cells / params.n_ctx} },{ {"name", "kv_cache_tokens"}, {"help", "KV-cache tokens."}, - {"value", (uint64_t) data.at("kv_cache_tokens_count")} + {"value", (uint64_t) res_metrics->kv_cache_tokens_count} },{ {"name", "requests_processing"}, {"help", "Number of request processing."}, - {"value", (uint64_t) data.at("processing")} + {"value", (uint64_t) res_metrics->n_processing_slots} },{ {"name", "requests_deferred"}, {"help", "Number of request deferred."}, - {"value", (uint64_t) data.at("deferred")} + {"value", (uint64_t) res_metrics->n_tasks_deferred} }}} }; @@ -2766,8 +3267,7 @@ int main(int argc, char ** argv) { } } - const int64_t t_start = data.at("t_start"); - res.set_header("Process-Start-Time-Unix", std::to_string(t_start)); + res.set_header("Process-Start-Time-Unix", std::to_string(res_metrics->t_start)); res.set_content(prometheus.str(), "text/plain; version=0.0.4"); res.status = 200; // HTTP OK @@ -2793,14 +3293,15 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - server_task_result result = ctx_server.queue_results.recv(id_task); + server_task_result_ptr result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - if (result.error) { - res_error(res, result.data); - } else { - res_ok(res, result.data); + if (result->is_error()) { + res_error(res, result->to_json()); + return; } + + res_ok(res, result->to_json()); }; const auto handle_slots_restore = [&ctx_server, &res_error, &res_ok, ¶ms](const httplib::Request & req, httplib::Response & res, int id_slot) { @@ -2823,14 +3324,16 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - server_task_result result = ctx_server.queue_results.recv(id_task); + server_task_result_ptr result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - if (result.error) { - res_error(res, result.data); - } else { - res_ok(res, result.data); + if (result->is_error()) { + res_error(res, result->to_json()); + return; } + + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + res_ok(res, result->to_json()); }; const auto handle_slots_erase = [&ctx_server, &res_error, &res_ok](const httplib::Request & /* req */, httplib::Response & res, int id_slot) { @@ -2843,14 +3346,16 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - server_task_result result = ctx_server.queue_results.recv(id_task); + server_task_result_ptr result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - if (result.error) { - res_error(res, result.data); - } else { - res_ok(res, result.data); + if (result->is_error()) { + res_error(res, result->to_json()); + return; } + + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + res_ok(res, result->to_json()); }; const auto handle_slots_action = [¶ms, &res_error, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) { @@ -2905,12 +3410,19 @@ int main(int argc, char ** argv) { res_ok(res, {{ "success", true }}); }; - const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) { + // handle completion-like requests (completion, chat, 
infill) + // we can optionally provide a custom format for partial results and final results + const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok]( + server_task_inf_type inf_type, + json & data, + httplib::Response & res, + bool oai_compat = false) { if (ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; } + data["completion_id"] = gen_chatcmplid(); std::vector tasks = ctx_server.create_tasks_inference(data, inf_type); ctx_server.queue_results.add_waiting_tasks(tasks); ctx_server.queue_tasks.post(tasks); @@ -2919,15 +3431,15 @@ int main(int argc, char ** argv) { const auto task_ids = server_task::get_list_id(tasks); if (!stream) { - ctx_server.receive_cmpl_results(task_ids, [&](std::vector & results) { + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { if (results.size() == 1) { // single result - res_ok(res, results[0].data); + res_ok(res, results[0]->to_json()); } else { // multiple results (multitask) json arr = json::array(); - for (const auto & res : results) { - arr.push_back(res.data); + for (auto & res : results) { + arr.push_back(res->to_json()); } res_ok(res, arr); } @@ -2937,12 +3449,26 @@ int main(int argc, char ** argv) { ctx_server.queue_results.remove_waiting_task_ids(task_ids); } else { - const auto chunked_content_provider = [task_ids, &ctx_server](size_t, httplib::DataSink & sink) { - ctx_server.receive_cmpl_results_stream(task_ids, [&](const server_task_result & result) -> bool { - return server_sent_event(sink, "data", result.data); + const auto chunked_content_provider = [task_ids, &ctx_server, oai_compat](size_t, httplib::DataSink & sink) { + ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_ptr & result) -> bool { + json res_json = result->to_json(); + if (res_json.is_array()) { + for (const auto & res : res_json) { + if (!server_sent_event(sink, "data", res)) { + return false; + } + } + return true; + } else { + return server_sent_event(sink, "data", res_json); + } }, [&](const json & error_data) { server_sent_event(sink, "error", error_data); }); + if (oai_compat) { + static const std::string ev_done = "data: [DONE]\n\n"; + sink.write(ev_done.data(), ev_done.size()); + } sink.done(); return false; }; @@ -3010,61 +3536,15 @@ int main(int argc, char ** argv) { return handle_completions_generic(SERVER_TASK_INF_TYPE_INFILL, data, res); }; - // TODO: maybe merge this function with "handle_completions_generic" - const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) { + const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { if (ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support completions. 
Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; } json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); - - std::vector tasks = ctx_server.create_tasks_inference(data, SERVER_TASK_INF_TYPE_COMPLETION); - ctx_server.queue_results.add_waiting_tasks(tasks); - ctx_server.queue_tasks.post(tasks); - - bool stream = json_value(data, "stream", false); - const auto task_ids = server_task::get_list_id(tasks); - const auto completion_id = gen_chatcmplid(); - - if (!stream) { - ctx_server.receive_cmpl_results(task_ids, [&](const std::vector & results) { - // multitask is never support in chat completion, there is only one result - json result_oai = format_final_response_oaicompat(data, results[0].data, completion_id, /*.streaming =*/ false, verbose); - res_ok(res, result_oai); - }, [&](const json & error_data) { - res_error(res, error_data); - }); - - ctx_server.queue_results.remove_waiting_task_ids(task_ids); - } else { - const auto chunked_content_provider = [task_ids, &ctx_server, completion_id](size_t, httplib::DataSink & sink) { - ctx_server.receive_cmpl_results_stream(task_ids, [&](const server_task_result & result) -> bool { - std::vector result_array = format_partial_response_oaicompat(result.data, completion_id); - for (auto & event_data : result_array) { - if (event_data.empty()) { - continue; // skip the stop token - } - if (!server_sent_event(sink, "data", event_data)) { - return false; // connection is closed - } - } - return true; // ok - }, [&](const json & error_data) { - server_sent_event(sink, "error", error_data); - }); - static const std::string ev_done = "data: [DONE]\n\n"; - sink.write(ev_done.data(), ev_done.size()); - sink.done(); - return true; - }; - - auto on_complete = [task_ids, &ctx_server] (bool) { - ctx_server.queue_results.remove_waiting_task_ids(task_ids); - }; - - res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); - } + data["__oaicompat_chat"] = true; + return handle_completions_generic(SERVER_TASK_INF_TYPE_COMPLETION, data, res, true); }; const auto handle_models = [¶ms, &ctx_server](const httplib::Request &, httplib::Response & res) { @@ -3139,12 +3619,12 @@ int main(int argc, char ** argv) { const auto handle_embeddings = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { const json body = json::parse(req.body); - bool is_openai = false; + bool oaicompat = false; // an input prompt can be a string or a list of tokens (integer) json prompt; if (body.count("input") != 0) { - is_openai = true; + oaicompat = true; prompt = body.at("input"); } else if (body.count("content") != 0) { // with "content", we only support single prompt @@ -3165,9 +3645,10 @@ int main(int argc, char ** argv) { // get the result std::unordered_set task_ids = server_task::get_list_id(tasks); - ctx_server.receive_cmpl_results(task_ids, [&](std::vector & results) { - for (const auto & res : results) { - responses.push_back(res.data); + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { + for (auto & res : results) { + GGML_ASSERT(dynamic_cast(res.get()) != nullptr); + responses.push_back(res->to_json()); } }, [&](const json & error_data) { res_error(res, error_data); @@ -3182,7 +3663,7 @@ int main(int argc, char ** argv) { } // write JSON response - json root = is_openai + json root = oaicompat ? 
format_embeddings_response_oaicompat(body, responses)
: responses[0];
res_ok(res, root);
@@ -3243,9 +3724,10 @@ int main(int argc, char ** argv) {
// get the result
std::unordered_set task_ids = server_task::get_list_id(tasks);
- ctx_server.receive_cmpl_results(task_ids, [&](std::vector & results) {
- for (const auto & res : results) {
- responses.push_back(res.data);
+ ctx_server.receive_multi_results(task_ids, [&](std::vector & results) {
+ for (auto & res : results) {
+ GGML_ASSERT(dynamic_cast(res.get()) != nullptr);
+ responses.push_back(res->to_json());
}
}, [&](const json & error_data) {
res_error(res, error_data);
@@ -3301,11 +3783,16 @@ int main(int argc, char ** argv) {
const int id_task = ctx_server.queue_tasks.post(task);
ctx_server.queue_results.add_waiting_task_id(id_task);
- server_task_result result = ctx_server.queue_results.recv(id_task);
+ server_task_result_ptr result = ctx_server.queue_results.recv(id_task);
ctx_server.queue_results.remove_waiting_task_id(id_task);
- res_ok(res, result.data);
- res.status = 200; // HTTP OK
+ if (result->is_error()) {
+ res_error(res, result->to_json());
+ return;
+ }
+
+ GGML_ASSERT(dynamic_cast(result.get()) != nullptr);
+ res_ok(res, result->to_json());
};
//
diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md
index 2930a2e0d..fa3d0a2f5 100644
--- a/examples/server/tests/README.md
+++ b/examples/server/tests/README.md
@@ -44,4 +44,10 @@ To run with stdout/stderr display in real time (verbose output, but useful for d
DEBUG=1 ./tests.sh -s -v -x
```
+Hint: You can compile and run the tests in a single command, useful for local development:
+
+```shell
+cmake --build build -j --target llama-server && ./examples/server/tests/tests.sh
+```
+
To see all available arguments, please refer to [pytest documentation](https://docs.pytest.org/en/stable/how-to/usage.html)
diff --git a/examples/server/tests/tests.sh b/examples/server/tests/tests.sh
index 1e285dcda..1e0777de3 100755
--- a/examples/server/tests/tests.sh
+++ b/examples/server/tests/tests.sh
@@ -1,5 +1,9 @@
#!/bin/bash
+# make sure we are in the right directory
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+cd $SCRIPT_DIR
+
set -eu
if [ $# -lt 1 ]
diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py
index 8a439f9ef..f13c6c4ca 100644
--- a/examples/server/tests/unit/test_chat_completion.py
+++ b/examples/server/tests/unit/test_chat_completion.py
@@ -12,13 +12,13 @@ def create_server():
@pytest.mark.parametrize(
- "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated",
+ "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason",
[
- ("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False),
- ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False),
+ (None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"),
+ ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"),
]
)
-def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated):
+def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason):
global server
server.start()
res = server.make_request("POST", "/chat/completions", data={ @@
-30,29 +30,27 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte ], }) assert res.status_code == 200 + assert res.body["model"] == model if model is not None else server.model_alias assert res.body["usage"]["prompt_tokens"] == n_prompt assert res.body["usage"]["completion_tokens"] == n_predicted choice = res.body["choices"][0] assert "assistant" == choice["message"]["role"] assert match_regex(re_content, choice["message"]["content"]) - if truncated: - assert choice["finish_reason"] == "length" - else: - assert choice["finish_reason"] == "stop" + assert choice["finish_reason"] == finish_reason @pytest.mark.parametrize( - "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated", + "system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason", [ - ("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False), - ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False), + ("Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"), + ("You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"), ] ) -def test_chat_completion_stream(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated): +def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason): global server + server.model_alias = None # try using DEFAULT_OAICOMPAT_MODEL server.start() res = server.make_stream_request("POST", "/chat/completions", data={ - "model": model, "max_tokens": max_tokens, "messages": [ {"role": "system", "content": system_prompt}, @@ -63,16 +61,13 @@ def test_chat_completion_stream(model, system_prompt, user_prompt, max_tokens, r content = "" for data in res: choice = data["choices"][0] + assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future if choice["finish_reason"] in ["stop", "length"]: assert data["usage"]["prompt_tokens"] == n_prompt assert data["usage"]["completion_tokens"] == n_predicted assert "content" not in choice["delta"] assert match_regex(re_content, content) - # FIXME: not sure why this is incorrect in stream mode - # if truncated: - # assert choice["finish_reason"] == "length" - # else: - # assert choice["finish_reason"] == "stop" + assert choice["finish_reason"] == finish_reason else: assert choice["finish_reason"] is None content += choice["delta"]["content"] @@ -93,7 +88,7 @@ def test_chat_completion_with_openai_library(): temperature=0.8, ) print(res) - assert res.choices[0].finish_reason == "stop" + assert res.choices[0].finish_reason == "length" assert res.choices[0].message.content is not None assert match_regex("(Suddenly)+", res.choices[0].message.content) diff --git a/examples/server/tests/unit/test_completion.py b/examples/server/tests/unit/test_completion.py index 2fa30dd03..1c3aa77de 100644 --- a/examples/server/tests/unit/test_completion.py +++ b/examples/server/tests/unit/test_completion.py @@ -51,6 +51,24 @@ def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_promp content += data["content"] +def test_completion_stream_vs_non_stream(): + global server + server.start() + res_stream = server.make_stream_request("POST", "/completion", data={ + "n_predict": 8, + "prompt": "I believe the meaning of life is", + "stream": True, + }) + res_non_stream = server.make_request("POST", 
"/completion", data={ + "n_predict": 8, + "prompt": "I believe the meaning of life is", + }) + content_stream = "" + for data in res_stream: + content_stream += data["content"] + assert content_stream == res_non_stream.body["content"] + + @pytest.mark.parametrize("n_slots", [1, 2]) def test_consistent_result_same_seed(n_slots: int): global server @@ -221,3 +239,24 @@ def test_completion_parallel_slots(n_slots: int, n_requests: int): assert len(res.body["content"]) > 10 # FIXME: the result is not deterministic when using other slot than slot 0 # assert match_regex(re_content, res.body["content"]) + + +def test_n_probs(): + global server + server.start() + res = server.make_request("POST", "/completion", data={ + "prompt": "I believe the meaning of life is", + "n_probs": 10, + "temperature": 0.0, + "n_predict": 5, + }) + assert res.status_code == 200 + assert "completion_probabilities" in res.body + assert len(res.body["completion_probabilities"]) == 5 + for tok in res.body["completion_probabilities"]: + assert "probs" in tok + assert len(tok["probs"]) == 10 + for prob in tok["probs"]: + assert "prob" in prob + assert "tok_str" in prob + assert 0.0 <= prob["prob"] <= 1.0 diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index e4451532c..a96116ac3 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" @@ -40,17 +41,6 @@ using json = nlohmann::ordered_json; #define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) #define QUE_DBG(fmt, ...) LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) -// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 -enum error_type { - ERROR_TYPE_INVALID_REQUEST, - ERROR_TYPE_AUTHENTICATION, - ERROR_TYPE_SERVER, - ERROR_TYPE_NOT_FOUND, - ERROR_TYPE_PERMISSION, - ERROR_TYPE_UNAVAILABLE, // custom error - ERROR_TYPE_NOT_SUPPORTED, // custom error -}; - template static T json_value(const json & body, const std::string & key, const T & default_value) { // Fallback null to default value @@ -485,48 +475,11 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx, return out; } -struct completion_token_output { - llama_token tok; - std::string text_to_send; - - struct token_prob { - llama_token tok; - float prob; - }; - - std::vector probs; -}; - -// convert a vector of completion_token_output to json -static json probs_vector_to_json(const llama_context * ctx, const std::vector & probs) { - json out = json::array(); - - for (const auto & prob : probs) { - json probs_for_token = json::array(); - - for (const auto & p : prob.probs) { - const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); - probs_for_token.push_back(json { - {"tok_str", tok_str}, - {"prob", p.prob}, - }); - } - - const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); - out.push_back(json { - {"content", tok_str}, - {"probs", probs_for_token}, - }); - } - - return out; -} - static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) { const std::string str = std::string(event) + ": " + data.dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; // note: these newlines are important (not sure why though, if you know, add a comment to explain) + "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row). 
LOG_DBG("data stream, to_send: %s", str.c_str()); @@ -604,164 +557,6 @@ static json oaicompat_completion_params_parse( return llama_params; } -static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) { - bool stopped_word = result.count("stopped_word") != 0; - bool stopped_eos = json_value(result, "stopped_eos", false); - int num_tokens_predicted = json_value(result, "tokens_predicted", 0); - int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); - std::string content = json_value(result, "content", std::string("")); - - std::string finish_reason = "length"; - if (stopped_word || stopped_eos) { - finish_reason = "stop"; - } - - json choices = - streaming ? json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}) - : json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"message", json{{"content", content}, - {"role", "assistant"}}}}}); - - std::time_t t = std::time(0); - - json res = json { - {"choices", choices}, - {"created", t}, - {"model", - json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, - {"object", streaming ? "chat.completion.chunk" : "chat.completion"}, - {"usage", json { - {"completion_tokens", num_tokens_predicted}, - {"prompt_tokens", num_prompt_tokens}, - {"total_tokens", num_tokens_predicted + num_prompt_tokens} - }}, - {"id", completion_id} - }; - - // extra fields for debugging purposes - if (verbose) { - res["__verbose"] = result; - } - - if (result.contains("completion_probabilities")) { - res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); - } - - if (result.contains("timings")) { - res.push_back({"timings", json_value(result, "timings", json::object())}); - } - - return res; -} - -// return value is vector as there is one case where we might need to generate two responses -static std::vector format_partial_response_oaicompat(const json & result, const std::string & completion_id) { - if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { - return std::vector({result}); - } - - bool first = json_value(result, "oaicompat_token_ctr", 0) == 0; - std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - - bool stopped_word = json_value(result, "stopped_word", false); - bool stopped_eos = json_value(result, "stopped_eos", false); - bool stopped_limit = json_value(result, "stopped_limit", false); - std::string content = json_value(result, "content", std::string("")); - - std::string finish_reason; - if (stopped_word || stopped_eos) { - finish_reason = "stop"; - } - if (stopped_limit) { - finish_reason = "length"; - } - - std::time_t t = std::time(0); - - json choices; - - if (!finish_reason.empty()) { - choices = json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}); - } else { - if (first) { - if (content.empty()) { - choices = json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{{"role", "assistant"}}}}}); - } else { - // We have to send this as two updates to conform to openai behavior - json initial_ret = json{{"choices", json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"role", "assistant"} - }}}})}, - {"created", t}, - {"id", completion_id}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - json second_ret = json{ - {"choices", 
json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"content", content}}} - }})}, - {"created", t}, - {"id", completion_id}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - return std::vector({initial_ret, second_ret}); - } - } else { - // Some idiosyncrasy in task processing logic makes several trailing calls - // with empty content, we ignore these at the calee site. - if (content.empty()) { - return std::vector({json::object()}); - } - - choices = json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", - json{ - {"content", content}, - }}, - }}); - } - } - - json ret = json { - {"choices", choices}, - {"created", t}, - {"id", completion_id}, - {"model", modelname}, - {"object", "chat.completion.chunk"} - }; - - if (result.contains("timings")) { - ret.push_back({"timings", json_value(result, "timings", json::object())}); - } - - if (!finish_reason.empty()) { - int num_tokens_predicted = json_value(result, "tokens_predicted", 0); - int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); - ret.push_back({"usage", json { - {"completion_tokens", num_tokens_predicted}, - {"prompt_tokens", num_prompt_tokens}, - {"total_tokens", num_tokens_predicted + num_prompt_tokens} - }}); - } - - return std::vector({ret}); -} - static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) { json data = json::array(); int i = 0; @@ -853,43 +648,3 @@ static json format_detokenized_response(const std::string & content) { {"content", content} }; } - -static json format_error_response(const std::string & message, const enum error_type type) { - std::string type_str; - int code = 500; - switch (type) { - case ERROR_TYPE_INVALID_REQUEST: - type_str = "invalid_request_error"; - code = 400; - break; - case ERROR_TYPE_AUTHENTICATION: - type_str = "authentication_error"; - code = 401; - break; - case ERROR_TYPE_NOT_FOUND: - type_str = "not_found_error"; - code = 404; - break; - case ERROR_TYPE_SERVER: - type_str = "server_error"; - code = 500; - break; - case ERROR_TYPE_PERMISSION: - type_str = "permission_error"; - code = 403; - break; - case ERROR_TYPE_NOT_SUPPORTED: - type_str = "not_supported_error"; - code = 501; - break; - case ERROR_TYPE_UNAVAILABLE: - type_str = "unavailable_error"; - code = 503; - break; - } - return json { - {"code", code}, - {"message", message}, - {"type", type_str}, - }; -} From f162d45a21b27f7613f14539f9a4932d6ff3ca48 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 6 Dec 2024 13:29:05 +0100 Subject: [PATCH 16/99] common : bring back --no-warmup to server (#10686) --- common/arg.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/arg.cpp b/common/arg.cpp index 078c75384..0db59f701 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -786,7 +786,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.warmup = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--spm-infill"}, string_format( From c5ede3849fc021174862f9c0bf8273808d8f0d39 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 6 Dec 2024 21:33:15 +0200 Subject: [PATCH 17/99] convert : add custom attention mapping --- gguf-py/gguf/tensor_mapping.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 1b6a3f4ad..f0a7b6478 100644 --- 
a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -146,6 +146,7 @@ class TensorNameMap: # Attention query MODEL_TENSOR.ATTN_Q: ( "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2 + "model.layers.{bid}.self_attn.q_proj_no_perm", # llama-custom "layers.{bid}.attention.wq", # llama-pth "encoder.layer.{bid}.attention.self.query", # bert "transformer.h.{bid}.attn.q_proj", # gpt-j @@ -158,6 +159,7 @@ class TensorNameMap: # Attention key MODEL_TENSOR.ATTN_K: ( "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2 + "model.layers.{bid}.self_attn.k_proj_no_perm", # llama-custom "layers.{bid}.attention.wk", # llama-pth "encoder.layer.{bid}.attention.self.key", # bert "transformer.h.{bid}.attn.k_proj", # gpt-j From 784a14aa496a66cc43e76014a1bf4333b4a2a55a Mon Sep 17 00:00:00 2001 From: Sukriti Sharma Date: Sat, 7 Dec 2024 00:02:14 -0700 Subject: [PATCH 18/99] convert : add support for Roberta embeddings (#10695) --- convert_hf_to_gguf.py | 8 +- convert_hf_to_gguf_update.py | 1 + models/ggml-vocab-roberta-bpe.gguf.inp | 112 +++++++++++++++++++++++++ models/ggml-vocab-roberta-bpe.gguf.out | 46 ++++++++++ 4 files changed, 165 insertions(+), 2 deletions(-) create mode 100644 models/ggml-vocab-roberta-bpe.gguf.inp create mode 100644 models/ggml-vocab-roberta-bpe.gguf.out diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 9f1419e29..a4eece934 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -661,6 +661,9 @@ class Model: if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35": # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0 res = "minerva-7b" + if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65": + # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base + res = "roberta-bpe" if res is None: logger.warning("\n") @@ -2533,7 +2536,7 @@ class InternLM2Model(Model): return [(self.map_tensor_name(name), data_torch)] -@Model.register("BertModel", "CamembertModel") +@Model.register("BertModel", "CamembertModel", "RobertaModel") class BertModel(Model): model_arch = gguf.MODEL_ARCH.BERT @@ -2574,7 +2577,8 @@ class BertModel(Model): # we need this to validate the size of the token_type embeddings # though currently we are passing all zeros to the token_type embeddings - self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B" + # "Sequence A" or "Sequence B" + self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) # convert to phantom space vocab def phantom(tok): diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index ce3c571df..aa10e5db7 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -103,6 +103,7 @@ models = [ {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", }, {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", }, {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", }, + {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"}, ] diff --git a/models/ggml-vocab-roberta-bpe.gguf.inp b/models/ggml-vocab-roberta-bpe.gguf.inp new file mode 100644 index 000000000..9baf7d77a --- /dev/null +++ b/models/ggml-vocab-roberta-bpe.gguf.inp @@ -0,0 +1,112 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + 
+__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + + +__ggml_vocab_test__ + + + +__ggml_vocab_test__ + + + + +__ggml_vocab_test__ + + +__ggml_vocab_test__ +Hello world +__ggml_vocab_test__ + Hello world +__ggml_vocab_test__ +Hello World +__ggml_vocab_test__ + Hello World +__ggml_vocab_test__ + Hello World! +__ggml_vocab_test__ +Hello, world! +__ggml_vocab_test__ + Hello, world! +__ggml_vocab_test__ + this is 🦙.cpp +__ggml_vocab_test__ +w048 7tuijk dsdfhu +__ggml_vocab_test__ +нещо на Български +__ggml_vocab_test__ +កាន់តែពិសេសអាចខលចេញ +__ggml_vocab_test__ +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) +__ggml_vocab_test__ +Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello + Hello +__ggml_vocab_test__ + ( +__ggml_vocab_test__ + + = +__ggml_vocab_test__ +' era +__ggml_vocab_test__ +Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ +__ggml_vocab_test__ +!!!!!! +__ggml_vocab_test__ +3 +__ggml_vocab_test__ +33 +__ggml_vocab_test__ +333 +__ggml_vocab_test__ +3333 +__ggml_vocab_test__ +33333 +__ggml_vocab_test__ +333333 +__ggml_vocab_test__ +3333333 +__ggml_vocab_test__ +33333333 +__ggml_vocab_test__ +333333333 +__ggml_vocab_test__ +Cửa Việt +__ggml_vocab_test__ + discards +__ggml_vocab_test__ + + + + + + + + + + + +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL +__ggml_vocab_test__ diff --git a/models/ggml-vocab-roberta-bpe.gguf.out b/models/ggml-vocab-roberta-bpe.gguf.out new file mode 100644 index 000000000..f181ac3dc --- /dev/null +++ b/models/ggml-vocab-roberta-bpe.gguf.out @@ -0,0 +1,46 @@ + 2550 204 18430 377 + 597 2768 298 8564 + + 1437 + 1437 1437 + 1437 1437 1437 + 50117 + 50118 + 50140 + 50140 50118 + 50117 50118 + 31414 232 + 20920 232 + 31414 623 + 20920 623 + 20920 623 328 + 31414 6 232 328 + 20920 6 232 328 + 42 16 8103 18164 27 4 49317 + 605 40976 262 10109 18474 385 29 36807 6455 + 36765 25482 22063 23171 34251 18697 10809 26161 18697 3602 22063 27969 40966 25417 15264 26161 24269 36709 41171 35328 + 1376 17772 7471 1376 17772 19002 1376 17772 9085 1376 4333 13859 1376 17772 9357 1376 4333 9264 1376 17772 25448 1376 17772 18400 1376 17772 4333 1376 4333 10172 1376 17772 4333 1376 17772 7258 1376 17772 19002 1376 17772 5782 1376 17772 10172 1376 17772 3726 1376 17772 5782 1376 4333 10172 1376 17772 23171 + 6569 15113 7471 36 21113 43 17841 19002 17 8384 6569 14285 4958 12605 36 34654 2841 4203 354 10146 26511 1070 43 36174 5782 36 8338 21554 14 34 63 308 19233 43 + 31414 + 20920 + 1437 20920 + 1437 1437 20920 + 1437 1437 1437 20920 + 1437 1437 1437 20920 50118 1437 1437 1437 20920 + 36 + 50118 5457 + 108 3567 + 31414 6 1423 108 1250 328 1336 32 47 17841 10172 17487 47876 3602 48617 15264 46537 11423 27326 48494 8210 49233 1558 1570 27761 49429 43251 10809 17772 + 32376 12846 + 246 + 3103 + 25631 + 46152 + 3103 25631 + 46152 3103 + 46152 25631 + 46152 46152 + 46152 3103 25631 + 347 1376 2023 12410 102 16376 1376 2023 6382 90 + 9553 5954 + 50118 1437 50140 1437 50140 50118 1437 50117 1437 50117 50117 1437 50117 50118 1437 1437 50118 1437 1437 1437 50118 1437 1437 1437 1437 50118 1437 1437 1437 1437 1437 50118 6569 
15113 7471 36 21113 43 17841 19002 17 8384 6569 14285 4958 12605 36 34654 2841 4203 354 10146 26511 1070 43 36174 5782 8103 18164 27 6569 18164 27 155 2357 30242 155 25631 30242 3103 30242 25631 30242 46152 30242 3103 25631 155 4 246 155 7586 246 155 734 246 25974 17772 7471 1376 17772 19002 1376 17772 9085 1376 4333 13859 1376 17772 9357 1376 4333 9264 1376 17772 25448 1376 17772 18400 1376 17772 4333 1376 4333 10172 1376 17772 4333 1376 17772 7258 1376 17772 19002 1376 17772 5782 18636 10172 17487 47876 3602 48617 15264 46537 11423 27326 48494 8210 49233 1558 1570 27761 49429 43251 10809 17772 36738 48332 47463 18697 10809 25482 22063 23171 34251 18697 10809 26161 18697 3602 22063 27969 40966 25417 15264 26161 24269 36709 41171 35328 128 49690 108 49972 49519 12905 48149 48149 43796 32376 12846 27282 28749 38 348 57 128 41042 37 18 89 6 128 4629 47 686 116 128 448 45 686 38 581 146 24 6 128 495 47 101 103 6845 116 166 108 30660 10 108 462 574 From 86a1934978ce5daffc7e5b4251f6540ee5e7b47b Mon Sep 17 00:00:00 2001 From: Robert Ormandi <52251610+ormandi@users.noreply.github.com> Date: Sat, 7 Dec 2024 01:55:01 -0600 Subject: [PATCH 19/99] metal : Extend how Llama.cpp locates metal resources (#10676) * metal : Extend how Llama.cpp locates metal resources (#10675) * It searches the resource file in the directory where the current binary is located as well. * Resolves symbolic links. Rationale: When we plug this dependency into a Bazel build and run it in the context of Bazel (e.g. testing): * the execution directory is often very different from where the files are located and no direct control over this (Bazel sandboxing), * the Bazel sandbox often use symbolic links to make files available. With this patch, we can have the resource file added to the target, can build and run tests in the context of Bazel. * Update ggml/src/ggml-metal/ggml-metal.m Co-authored-by: Georgi Gerganov * Update ggml/src/ggml-metal/ggml-metal.m Co-authored-by: Georgi Gerganov --------- Co-authored-by: Georgi Gerganov --- ggml/src/ggml-metal/ggml-metal.m | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index f80fda7a4..34fe5778e 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -510,6 +510,35 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de #endif NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"]; + if (path_lib == nil) { + // Try to find the resource in the directory where the current binary located. + NSString * current_binary = [[NSProcessInfo processInfo] arguments][0]; + NSString * bin_dir = [current_binary stringByDeletingLastPathComponent]; + NSString * default_metallib_path = [NSString pathWithComponents:@[bin_dir, @"default.metallib"]]; + if ([[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) { + GGML_LOG_INFO("%s: found '%s'\n", __func__, [default_metallib_path UTF8String]); + NSDictionary * atts = [[NSFileManager defaultManager] attributesOfItemAtPath:default_metallib_path error:&error]; + if (atts && atts[NSFileType] == NSFileTypeSymbolicLink) { + // Optionally, if this is a symlink, try to resolve it. 
+ default_metallib_path = [[NSFileManager defaultManager] destinationOfSymbolicLinkAtPath:default_metallib_path error:&error]; + if (default_metallib_path && [default_metallib_path length] > 0 && ![[default_metallib_path substringToIndex:1] isEqualToString:@"/"]) { + // It is a relative path, adding the binary directory as directory prefix. + default_metallib_path = [NSString pathWithComponents:@[bin_dir, default_metallib_path]]; + } + if (!default_metallib_path || ![[NSFileManager defaultManager] isReadableFileAtPath:default_metallib_path]) { + // Link to the resource could not be resolved. + default_metallib_path = nil; + } else { + GGML_LOG_INFO("%s: symlink resolved '%s'\n", __func__, [default_metallib_path UTF8String]); + } + } + } else { + // The resource couldn't be found in the binary's directory. + default_metallib_path = nil; + } + path_lib = default_metallib_path; + } + if (try_metallib && path_lib != nil) { // pre-compiled library found NSURL * libURL = [NSURL fileURLWithPath:path_lib]; From 3df784b3050f657ea681f804187ce5bddb433e88 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Sat, 7 Dec 2024 10:24:15 +0100 Subject: [PATCH 20/99] Vulkan: VK_KHR_cooperative_matrix support to speed up prompt processing (#10597) * Vulkan: Implement VK_KHR_cooperative_matrix support in the matrix matrix multiplication shader * Improve performance with better q4_k and q5_k dequant and store unrolling * Add Vulkan MUL_MAT and MUL_MAT_ID accumulator precision selection * Rework mulmat shader selection and compilation logic, avoid compiling shaders that won't get used by device * Vulkan: Implement accumulator switch for specific mul mat mat shaders * Vulkan: Unroll more loops for more mul mat mat performance * Vulkan: Add VK_AMD_shader_core_properties2 support to read Compute Unit count for split_k logic * Disable coopmat support on AMD proprietary driver * Remove redundant checks * Add environment variable GGML_VK_DISABLE_COOPMAT to disable VK_KHR_cooperative_matrix support * Fix rebase typo * Fix coopmat2 MUL_MAT_ID pipeline selection --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 823 +++++++++++------- .../ggml-vulkan/vulkan-shaders/mul_mm.comp | 229 +++-- .../vulkan-shaders/vulkan-shaders-gen.cpp | 73 +- 3 files changed, 739 insertions(+), 386 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index c7ac0e8f7..4b2e449a5 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1,7 +1,8 @@ #include "ggml-vulkan.h" #include -#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_PERF) +#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_PERF) || defined(GGML_VULKAN_CHECK_RESULTS) #include +#include "ggml-cpu.h" #endif #include @@ -169,8 +170,22 @@ struct vk_device_struct { bool uma; bool coopmat2; + bool coopmat_support; + bool coopmat_acc_f32_support; + bool coopmat_acc_f16_support; + uint32_t coopmat_m; + uint32_t coopmat_n; + uint32_t coopmat_k; + size_t idx; + bool mul_mat_l; + bool mul_mat_m; + bool mul_mat_s; + bool mul_mat_id_l; + bool mul_mat_id_m; + bool mul_mat_id_s; + vk_matmul_pipeline pipeline_matmul_f32; vk_matmul_pipeline pipeline_matmul_f32_f16; vk_matmul_pipeline2 pipeline_matmul_f16; @@ -181,10 +196,10 @@ struct vk_device_struct { vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT]; vk_matmul_pipeline pipeline_matmul_id_f32; - vk_matmul_pipeline pipeline_matmul_id_f16; - vk_matmul_pipeline pipeline_matmul_id_f16_f32; + vk_matmul_pipeline2 pipeline_matmul_id_f16; + 
vk_matmul_pipeline2 pipeline_matmul_id_f16_f32; - vk_matmul_pipeline pipeline_dequant_mul_mat_mat_id[GGML_TYPE_COUNT]; + vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_id[GGML_TYPE_COUNT]; vk_pipeline pipeline_dequant[GGML_TYPE_COUNT]; vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_COUNT]; @@ -1325,6 +1340,18 @@ static std::array fa_rows_cols(uint32_t D, uint32_t clamp, ggml_typ return {64, 64}; }; +static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vector& warptile, bool mul_mat_id) { + // Needs to be kept up to date on shader changes + const uint32_t bank_conflict_offset = device->coopmat_support ? 8 : 1; + const uint32_t type_size = device->fp16 ? sizeof(ggml_fp16_t) : sizeof(float); + const uint32_t warps = warptile[0] / device->subgroup_size; + + const uint32_t load_bufs = (warptile[1] + warptile[2]) * (warptile[3] + bank_conflict_offset) * type_size; + const uint32_t mmid_row_ids = mul_mat_id ? 3072 * sizeof(uint32_t) : 0; + const uint32_t coopmat_stage = device->coopmat_support ? warptile[7] * warptile[8] / warps * sizeof(float) : 0; + + return (load_bufs + mmid_row_ids + coopmat_stage) <= device->properties.limits.maxComputeSharedMemorySize; +} static void ggml_vk_load_shaders(vk_device& device) { VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")"); @@ -1382,12 +1409,25 @@ static void ggml_vk_load_shaders(vk_device& device) { m_align = 64; s_align = 32; } else { - l_warptile = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; - m_warptile = { 128, 64, 64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size }; - s_warptile = { subgroup_size_16, 32, 32, 16, 32, 32, 2, 2, 2, device->subgroup_size }; - l_warptile_mmq = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; - m_warptile_mmq = { 128, 64, 64, 32, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size }; - s_warptile_mmq = { subgroup_size_16, 32, 32, 32, 32, 32, 2, 2, 2, device->subgroup_size }; + // Matrix cores require different warp group sizes + const uint32_t tm_l = device->coopmat_support ? device->coopmat_m : 4; + const uint32_t tm_m = device->coopmat_support ? device->coopmat_m : 4; + const uint32_t tm_s = device->coopmat_support ? device->coopmat_m : 2; + const uint32_t tn_l = device->coopmat_support ? device->coopmat_n : 4; + const uint32_t tn_m = device->coopmat_support ? device->coopmat_n : 2; + const uint32_t tn_s = device->coopmat_support ? device->coopmat_n : 2; + const uint32_t tk_l = device->coopmat_support ? device->coopmat_k : 1; + const uint32_t tk_m = device->coopmat_support ? device->coopmat_k : 1; + const uint32_t tk_s = device->coopmat_support ? 
device->coopmat_k : 1; + + l_warptile = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, tm_l, tn_l, tk_l, device->subgroup_size }; + m_warptile = { 128, 64, 64, 16, device->subgroup_size, 32, 2, tm_m, tn_m, tk_m, device->subgroup_size }; + s_warptile = { subgroup_size_16, 32, 32, 16, 32, 32, 2, tm_s, tn_s, tk_s, device->subgroup_size }; + + l_warptile_mmq = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, tm_l, tn_l, tk_l, device->subgroup_size }; + m_warptile_mmq = { 128, 64, 64, 32, device->subgroup_size, 32, 2, tm_m, tn_m, tk_m, device->subgroup_size }; + s_warptile_mmq = { subgroup_size_16, 32, 32, 32, 32, 32, 2, tm_s, tn_s, tk_s, device->subgroup_size }; + l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 }; m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 }; s_mmq_wg_denoms = s_wg_denoms = { 32, 32, 1 }; @@ -1428,25 +1468,36 @@ static void ggml_vk_load_shaders(vk_device& device) { // assert mul_mat_mat_id shaders will fit. GGML_ASSERT(shmem_needed + 3072*4 <= device->properties.limits.maxComputeSharedMemorySize); } + // Disable medium and large matrix multiplication if not enough shared memory is available + // Check mmq warptiles as the largest configuration + // Throw an error if not enough for any matrix multiplication is available + if (!ggml_vk_matmul_shmem_support(device, s_warptile_mmq, false)) { + std::cerr << "ggml_vulkan: Error: Shared memory size too small for matrix multiplication." << std::endl; + throw std::runtime_error("Shared memory size too small for matrix multiplication."); + } else if (!ggml_vk_matmul_shmem_support(device, m_warptile_mmq, false)) { + device->mul_mat_m = false; + device->mul_mat_l = false; + } else if (!ggml_vk_matmul_shmem_support(device, l_warptile_mmq, false)) { + device->mul_mat_l = false; + } + + // Disable mul_mat_id if not enough shared memory is available + if (!ggml_vk_matmul_shmem_support(device, s_warptile_mmq, true)) { + device->mul_mat_id_s = false; + device->mul_mat_id_m = false; + device->mul_mat_id_l = false; + } else if (!ggml_vk_matmul_shmem_support(device, m_warptile_mmq, true)) { + device->mul_mat_id_m = false; + device->mul_mat_id_l = false; + } else if (!ggml_vk_matmul_shmem_support(device, l_warptile_mmq, true)) { + device->mul_mat_id_l = false; + } } device->pipeline_matmul_f32 = std::make_shared(); device->pipeline_matmul_f32_f16 = std::make_shared(); device->pipeline_matmul_id_f32 = std::make_shared(); - device->pipeline_matmul_id_f16_f32 = std::make_shared(); - device->pipeline_matmul_id_f16 = std::make_shared(); - device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0] = std::make_shared(); - device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1] = std::make_shared(); - device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0] = std::make_shared(); - device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1] = std::make_shared(); - device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0] = std::make_shared(); - device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K] = std::make_shared(); - device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K] = std::make_shared(); - device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K] = std::make_shared(); - device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K] = std::make_shared(); - device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared(); - device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared(); std::vector> compiles; auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t 
spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, const std::vector& specialization_constants, uint32_t align, bool disable_robustness = false) { @@ -1543,119 +1594,191 @@ static void ggml_vk_load_shaders(vk_device& device) { CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3) CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_matmul_id_f16, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 4) + CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, 4) + CREATE_MM2(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, , mmqid_wg_denoms, 
warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4) #undef CREATE_MM #undef CREATE_MM2 } else -#endif - if (device->fp16) { +#endif // defined(VK_NV_cooperative_matrix2) + if (device->coopmat_support) { // Create 6 variants, {s,m,l}x{unaligned,aligned} -#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \ +#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ + if (device->mul_mat ## ID ## _l) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \ + if (device->mul_mat ## ID ## _m) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \ + if (device->mul_mat ## ID ## _s) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \ + if (device->mul_mat ## ID ## _l) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC 
#F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \ + if (device->mul_mat ## ID ## _m) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \ + if (device->mul_mat ## ID ## _s) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \ - CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_matmul_f16.f32acc, matmul_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_matmul_f16_f32.f32acc, matmul_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3); + // Create 2 variants, {f16,f32} accumulator +#define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ + CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ + CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); + CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, 
warptile_mmq, vk_mat_mat_push_constants, 3); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); // If there's not enough shared memory for row_ids and the result tile, don't create these pipelines. 
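// [Editorial sketch, not part of the upstream patch] The block below only creates the matmul_id
// pipelines when at least one of the per-size flags is set, and each CREATE_MM expansion above
// additionally checks device->mul_mat##ID##_{l,m,s} before building that size variant, so a
// disabled tile size simply never gets a pipeline object. A minimal, self-contained illustration
// of the resulting tile-size fallback, mirroring the non-coopmat branch of
// ggml_vk_guess_matmul_pipeline further down in this patch (pick_tile and its bool parameters are
// simplified editorial stand-ins for the real vk_device fields):
//
//     enum class tile { S, M, L };
//
//     static tile pick_tile(int m, int n, bool has_s, bool has_m, bool has_l) {
//         // small tile for small shapes, or when it is the only variant that was created
//         if ((has_s && (m <= 32 || n <= 32)) || (!has_m && !has_l)) return tile::S;
//         // medium tile for medium shapes, or when the large tile was disabled
//         if ((has_m && (m <= 64 || n <= 64)) || !has_l)             return tile::M;
//         return tile::L;
//     }
//
// For example, pick_tile(4096, 512, true, true, false) yields tile::M, which is what the
// vendor switch added later in this patch effects on devices where the large tile is disabled.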
- if (device->properties.limits.maxComputeSharedMemorySize >= 32768) { - CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4); - CREATE_MM(pipeline_matmul_id_f16, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4); - CREATE_MM(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4); + if (device->mul_mat_id_s || device->mul_mat_id_m || device->mul_mat_id_l) { + CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); + CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); + CREATE_MM2(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + 
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + } +#undef CREATE_MM + } else if (device->fp16) { + // Create 6 variants, {s,m,l}x{unaligned,aligned} +#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ + if (device->mul_mat ## ID ## _l) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \ + if (device->mul_mat ## ID ## _m) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \ + if (device->mul_mat ## ID ## _s) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \ + if (device->mul_mat ## ID ## _l) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \ + if (device->mul_mat ## ID ## _m) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \ + if (device->mul_mat ## ID ## _s) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \ + + CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM2(pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + 
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + + // If there's not enough shared memory for row_ids and the result tile, don't create these pipelines. + if (device->mul_mat_id_s || device->mul_mat_id_m || device->mul_mat_id_l) { + CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); + CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); + CREATE_MM2(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); + + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); } #undef CREATE_MM } else { // Create 6 variants, {s,m,l}x{unaligned,aligned} -#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, 
WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \ +#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ + if (device->mul_mat ## ID ## _l) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \ + if (device->mul_mat ## ID ## _m) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \ + if (device->mul_mat ## ID ## _s) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \ + if (device->mul_mat ## ID ## _l) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \ + if (device->mul_mat ## ID ## _m) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \ + if (device->mul_mat ## ID ## _s) \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \ - CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3); - 
CREATE_MM(pipeline_matmul_f16.f32acc, matmul_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_matmul_f16_f32.f32acc, matmul_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3); + CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_matmul_f16.f32acc, matmul_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_matmul_f16_f32.f32acc, matmul_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3, ); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f32acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f32acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f32acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f32acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f32acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); - CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f32acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f32acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f32acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f32acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, 
); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f32acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); + CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f32acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, ); // If there's not enough shared memory for row_ids and the result tile, don't create these pipelines. - if (device->properties.limits.maxComputeSharedMemorySize >= 32768) { - CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4); - CREATE_MM(pipeline_matmul_id_f16, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4); - CREATE_MM(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4); + if (device->mul_mat_id_s || device->mul_mat_id_m || device->mul_mat_id_l) { + CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); + CREATE_MM(pipeline_matmul_id_f16.f32acc, matmul_id_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); + CREATE_MM(pipeline_matmul_id_f16_f32.f32acc, matmul_id_f16_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0], matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1], matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0], matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1], matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0], matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f32acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f32acc, matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f32acc, matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f32acc, matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f32acc, matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K], matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K], matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K], matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K], matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K], matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); - CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL], 
matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f32acc, matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f32acc, matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f32acc, matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f32acc, matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f32acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); + CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f32acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id); } +#undef CREATE_MM2 #undef CREATE_MM } @@ -1851,8 +1974,10 @@ static vk_device ggml_vk_get_device(size_t idx) { bool fp16_compute = false; bool maintenance4_support = false; bool sm_builtins = false; + bool amd_shader_core_properties2 = false; bool pipeline_robustness = false; bool coopmat2_support = false; + device->coopmat_support = false; // Check if maintenance4 is supported for (const auto& properties : ext_props) { @@ -1864,10 +1989,18 @@ static vk_device ggml_vk_get_device(size_t idx) { fp16_compute = true; } else if (strcmp("VK_NV_shader_sm_builtins", properties.extensionName) == 0) { sm_builtins = true; + } else if (strcmp("VK_AMD_shader_core_properties2", properties.extensionName) == 0) { + amd_shader_core_properties2 = true; } else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) { pipeline_robustness = true; + } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 && + !getenv("GGML_VK_DISABLE_COOPMAT")) { + device->coopmat_support = true; + device->coopmat_m = 0; + device->coopmat_n = 0; + device->coopmat_k = 0; } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 && - !getenv("GGML_VULKAN_DISABLE_COOPMAT2")) { + !getenv("GGML_VK_DISABLE_COOPMAT2")) { coopmat2_support = true; } } @@ -1876,11 +2009,14 @@ static vk_device ggml_vk_get_device(size_t idx) { vk::PhysicalDeviceMaintenance3Properties props3; vk::PhysicalDeviceMaintenance4Properties props4; vk::PhysicalDeviceSubgroupProperties subgroup_props; + vk::PhysicalDeviceDriverProperties driver_props; vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props; + vk::PhysicalDeviceShaderCoreProperties2AMD amd_shader_core_properties2_props; props2.pNext = &props3; props3.pNext = &subgroup_props; + subgroup_props.pNext = &driver_props; - VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&subgroup_props; + VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&driver_props; if (maintenance4_support) { last_struct->pNext = (VkBaseOutStructure *)&props4; @@ -1890,6 +2026,10 @@ static vk_device ggml_vk_get_device(size_t idx) { last_struct->pNext = (VkBaseOutStructure *)&sm_props; last_struct = (VkBaseOutStructure *)&sm_props; } + if (amd_shader_core_properties2) { + last_struct->pNext = (VkBaseOutStructure *)&amd_shader_core_properties2_props; + last_struct = (VkBaseOutStructure *)&amd_shader_core_properties2_props; + } #if defined(VK_NV_cooperative_matrix2) vk::PhysicalDeviceCooperativeMatrix2PropertiesNV coopmat2_props; @@ -1905,7 
+2045,7 @@ static vk_device ggml_vk_get_device(size_t idx) { const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE"); if (GGML_VK_FORCE_MAX_ALLOCATION_SIZE != nullptr) { - device->max_memory_allocation_size = std::stoi(GGML_VK_FORCE_MAX_ALLOCATION_SIZE); + device->max_memory_allocation_size = std::stoul(GGML_VK_FORCE_MAX_ALLOCATION_SIZE); } else if (maintenance4_support) { device->max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize); } else { @@ -1917,15 +2057,22 @@ static vk_device ggml_vk_get_device(size_t idx) { device->uma = device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; if (sm_builtins) { device->shader_core_count = sm_props.shaderSMCount; + } else if (amd_shader_core_properties2) { + device->shader_core_count = amd_shader_core_properties2_props.activeComputeUnitCount; } else { device->shader_core_count = 0; } - const char* GGML_VK_DISABLE_F16 = getenv("GGML_VK_DISABLE_F16"); - const bool force_disable_f16 = GGML_VK_DISABLE_F16 != nullptr; + const bool force_disable_f16 = getenv("GGML_VK_DISABLE_F16") != nullptr; device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute; + if (device->vendor_id == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && driver_props.driverID == vk::DriverId::eAmdProprietary)) { + // Intel drivers don't support coopmat properly yet + // Only RADV supports coopmat properly on AMD + device->coopmat_support = false; + } + std::vector queue_family_props = device->physical_device.getQueueFamilyProperties(); // Try to find a non-graphics compute queue and transfer-focused queues @@ -1976,6 +2123,16 @@ static vk_device ggml_vk_get_device(size_t idx) { device_extensions.push_back("VK_EXT_pipeline_robustness"); } + VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features; + coopmat_features.pNext = nullptr; + coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR; + coopmat_features.cooperativeMatrix = VK_FALSE; + + if (device->coopmat_support) { + last_struct->pNext = (VkBaseOutStructure *)&coopmat_features; + last_struct = (VkBaseOutStructure *)&coopmat_features; + } + #if defined(VK_NV_cooperative_matrix2) VkPhysicalDeviceCooperativeMatrix2FeaturesNV coopmat2_features {}; coopmat2_features.pNext = nullptr; @@ -1993,6 +2150,8 @@ static vk_device ggml_vk_get_device(size_t idx) { device->pipeline_robustness = pl_robustness_features.pipelineRobustness; + device->coopmat_support = device->coopmat_support && coopmat_features.cooperativeMatrix; + if (coopmat2_support) { #if defined(VK_NV_cooperative_matrix2) if (coopmat2_features.cooperativeMatrixWorkgroupScope && @@ -2083,6 +2242,74 @@ static vk_device ggml_vk_get_device(size_t idx) { if (device->fp16) { device_extensions.push_back("VK_KHR_shader_float16_int8"); } + + if (device->coopmat_support) { + // Query supported shapes + std::vector cm_props; + + PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR pfn_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR = + (PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR)vkGetInstanceProcAddr(vk_instance.instance, "vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR"); + + uint32_t cm_props_num; + + pfn_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(device->physical_device, &cm_props_num, nullptr); + + cm_props.resize(cm_props_num); + + for (auto& prop : cm_props) { + prop.sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR; + } + + 
pfn_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(device->physical_device, &cm_props_num, cm_props.data()); + + VK_LOG_DEBUG("ggml_vulkan: Cooperative Matrix Shapes: " << cm_props.size()); + + for (auto& prop : cm_props) { + VK_LOG_DEBUG("ggml_vulkan: M: " << prop.MSize << " N: " << prop.NSize << " K: " << prop.KSize << " A: " << vk::to_string((vk::ComponentTypeKHR)prop.AType) << " B: " << vk::to_string((vk::ComponentTypeKHR)prop.BType) << " C: " << vk::to_string((vk::ComponentTypeKHR)prop.CType) << " Result: " << vk::to_string((vk::ComponentTypeKHR)prop.ResultType) << " saturatingAccumulation: " << prop.saturatingAccumulation << " scope: " << vk::to_string((vk::ScopeKHR)prop.scope)); + + if ((vk::ComponentTypeKHR)prop.AType == vk::ComponentTypeKHR::eFloat16 && + (vk::ComponentTypeKHR)prop.BType == vk::ComponentTypeKHR::eFloat16 && + (vk::ScopeKHR)prop.scope == vk::ScopeKHR::eSubgroup + ) { + if ((vk::ComponentTypeKHR)prop.CType == vk::ComponentTypeKHR::eFloat32 && + (vk::ComponentTypeKHR)prop.ResultType == vk::ComponentTypeKHR::eFloat32) { + // coopmat sizes not set yet + if (device->coopmat_m == 0) { + device->coopmat_acc_f32_support = true; + device->coopmat_m = prop.MSize; + device->coopmat_n = prop.NSize; + device->coopmat_k = prop.KSize; + } else if (device->coopmat_m == prop.MSize && device->coopmat_n == prop.NSize && device->coopmat_k == prop.KSize) { + // Only enable if shape is identical + device->coopmat_acc_f32_support = true; + } + } else if ((vk::ComponentTypeKHR)prop.CType == vk::ComponentTypeKHR::eFloat16 && + (vk::ComponentTypeKHR)prop.ResultType == vk::ComponentTypeKHR::eFloat16) { + // coopmat sizes not set yet + if (device->coopmat_m == 0) { + device->coopmat_acc_f16_support = true; + device->coopmat_m = prop.MSize; + device->coopmat_n = prop.NSize; + device->coopmat_k = prop.KSize; + } else if (device->coopmat_m == prop.MSize && device->coopmat_n == prop.NSize && device->coopmat_k == prop.KSize) { + // Only enable if shape is identical + device->coopmat_acc_f16_support = true; + } + } + } + } + + if (device->coopmat_m == 0) { + // No suitable matmul mode found + GGML_LOG_DEBUG("ggml_vulkan: WARNING: No suitable matrix core mode found. 
Disabling matrix cores.\n"); + device->coopmat_support = false; + } + } + + if (device->coopmat_support) { + device_extensions.push_back("VK_KHR_cooperative_matrix"); + } + device->name = GGML_VK_NAME + std::to_string(idx); device_create_info = { @@ -2098,6 +2325,37 @@ static vk_device ggml_vk_get_device(size_t idx) { ggml_vk_create_queue(device, device->compute_queue, compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer }, false); // Shaders + // Disable matmul tile sizes early if performance low or not supported + switch (device->vendor_id) { +#ifndef GGML_VULKAN_RUN_TESTS + case VK_VENDOR_ID_AMD: + case VK_VENDOR_ID_INTEL: + device->mul_mat_l = false; + device->mul_mat_m = true; + device->mul_mat_s = true; + device->mul_mat_id_l = false; + device->mul_mat_id_m = true; + device->mul_mat_id_s = true; + break; + case VK_VENDOR_ID_APPLE: + device->mul_mat_l = false; + device->mul_mat_m = true; + device->mul_mat_s = false; + device->mul_mat_id_l = false; + device->mul_mat_id_m = true; + device->mul_mat_id_s = false; + break; +#endif + default: + device->mul_mat_l = true; + device->mul_mat_m = true; + device->mul_mat_s = true; + device->mul_mat_id_l = true; + device->mul_mat_id_m = true; + device->mul_mat_id_s = true; + break; + } + ggml_vk_load_shaders(device); if (!device->single_queue) { @@ -2155,15 +2413,24 @@ static void ggml_vk_print_gpu_info(size_t idx) { bool fp16_storage = false; bool fp16_compute = false; + bool coopmat_support = false; for (auto properties : ext_props) { if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { fp16_storage = true; } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { fp16_compute = true; + } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0) { + coopmat_support = true; } } + if (props2.properties.vendorID == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && driver_props.driverID == vk::DriverId::eAmdProprietary)) { + // Intel drivers don't support coopmat properly yet + // Only RADV supports coopmat properly on AMD + coopmat_support = false; + } + const char* GGML_VK_DISABLE_F16 = getenv("GGML_VK_DISABLE_F16"); bool force_disable_f16 = GGML_VK_DISABLE_F16 != nullptr; @@ -2186,13 +2453,28 @@ static void ggml_vk_print_gpu_info(size_t idx) { vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES; vk11_features.pNext = &vk12_features; + // Pointer to the last chain element + VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&vk12_features; + + VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features; + coopmat_features.pNext = nullptr; + coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR; + coopmat_features.cooperativeMatrix = VK_FALSE; + + if (coopmat_support) { + last_struct->pNext = (VkBaseOutStructure *)&coopmat_features; + last_struct = (VkBaseOutStructure *)&coopmat_features; + } + vkGetPhysicalDeviceFeatures2(physical_device, &device_features2); fp16 = fp16 && vk12_features.shaderFloat16; + coopmat_support = coopmat_support && coopmat_features.cooperativeMatrix; + std::string device_name = props2.properties.deviceName.data(); - GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu\n", - idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size); + GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu | matrix cores: %d\n", + idx, 
device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size, coopmat_support); if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) { GGML_LOG_DEBUG("ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want.\n"); @@ -2428,7 +2710,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) { return ctx->device->pipeline_matmul_f32_f16; } - if (prec == GGML_PREC_DEFAULT && ctx->device->coopmat2) { + if (prec == GGML_PREC_DEFAULT && ctx->device->fp16) { if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { return ctx->device->pipeline_matmul_f16_f32.f16acc; } @@ -2469,7 +2751,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte assert(src1_type == GGML_TYPE_F16); return ctx->device->pipeline_dequant_mul_mat_mat_f16[src0_type].f16acc; } - return ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc; + return ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat[src0_type].f32acc; } static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) { @@ -2498,16 +2780,25 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * return b_type == GGML_TYPE_F32 ? ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[a_type] : ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[a_type]; } -static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) { +static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type, ggml_prec prec) { VK_LOG_DEBUG("ggml_vk_get_mul_mat_mat_id_pipeline()"); if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { return ctx->device->pipeline_matmul_id_f32; } - if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { - return ctx->device->pipeline_matmul_id_f16_f32; - } - if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { - return ctx->device->pipeline_matmul_id_f16; + if (prec == GGML_PREC_DEFAULT && ctx->device->fp16) { + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { + return ctx->device->pipeline_matmul_id_f16_f32.f16acc; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { + return ctx->device->pipeline_matmul_id_f16.f16acc; + } + } else { + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { + return ctx->device->pipeline_matmul_id_f16_f32.f32acc; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { + return ctx->device->pipeline_matmul_id_f16.f32acc; + } } GGML_ASSERT(src1_type == GGML_TYPE_F32); @@ -2529,7 +2820,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co return nullptr; } - return ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type]; + return ctx->device->fp16 ? 
ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f16acc : ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type].f32acc; } static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec_id(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) { @@ -3119,62 +3410,31 @@ static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int return split_k; } -static vk_pipeline ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) { - if (m <= 32 || n <= 32) { - return aligned ? mmp->a_s : mmp->s; - } - return aligned ? mmp->a_m : mmp->m; - - GGML_UNUSED(ctx); -} - -static vk_pipeline ggml_vk_guess_matmul_pipeline_apple(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, bool aligned) { - return aligned ? mmp->a_m : mmp->m; - - GGML_UNUSED(ctx); -} - -static vk_pipeline ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, bool aligned) { - return aligned ? mmp->a_s : mmp->s; - - GGML_UNUSED(ctx); -} - -static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) { +static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned, ggml_type type_a) { VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")"); - switch (ctx->device->vendor_id) { - case VK_VENDOR_ID_AMD: - return ggml_vk_guess_matmul_pipeline_amd(ctx, mmp, m, n, aligned); - case VK_VENDOR_ID_APPLE: - return ggml_vk_guess_matmul_pipeline_apple(ctx, mmp, aligned); - case VK_VENDOR_ID_INTEL: - return ggml_vk_guess_matmul_pipeline_intel(ctx, mmp, aligned); - default: - break; - } if (ctx->device->coopmat2) { - if ((m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) { + if ((ctx->device->mul_mat_l && (m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) || (!ctx->device->mul_mat_m && !ctx->device->mul_mat_s)) { return aligned ? mmp->a_l : mmp->l; } - if ((m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) { + if ((ctx->device->mul_mat_m && (m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) || !ctx->device->mul_mat_s) { return aligned ? mmp->a_m : mmp->m; } return aligned ? mmp->a_s : mmp->s; } - if (m <= 32 || n <= 32) { + if ((ctx->device->mul_mat_s && (m <= 32 || n <= 32)) || (!ctx->device->mul_mat_m && !ctx->device->mul_mat_l)) { return aligned ? mmp->a_s : mmp->s; } - if (m <= 64 || n <= 64) { + if ((ctx->device->mul_mat_m && (m <= 64 || n <= 64)) || !ctx->device->mul_mat_l) { return aligned ? mmp->a_m : mmp->m; } return aligned ? 
mmp->a_l : mmp->l; } -static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n) { +static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, ggml_type type_a) { VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")"); - return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true)->align; + return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true, type_a)->align; } static void ggml_vk_matmul( @@ -3201,6 +3461,33 @@ static void ggml_vk_matmul( ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 }); } +static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) { + VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")"); + + if (ctx->device->coopmat2) { + if ((ctx->device->mul_mat_id_l && (m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) || (!ctx->device->mul_mat_id_m && !ctx->device->mul_mat_id_s)) { + return aligned ? mmp->a_l : mmp->l; + } + if ((ctx->device->mul_mat_id_m && (m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) || !ctx->device->mul_mat_id_s) { + return aligned ? mmp->a_m : mmp->m; + } + return aligned ? mmp->a_s : mmp->s; + } + + if ((ctx->device->mul_mat_id_s && (m <= 32 || n <= 32)) || (!ctx->device->mul_mat_id_m && !ctx->device->mul_mat_id_l)) { + return aligned ? mmp->a_s : mmp->s; + } + if ((ctx->device->mul_mat_id_m && (m <= 64 || n <= 64)) || !ctx->device->mul_mat_id_l) { + return aligned ? mmp->a_m : mmp->m; + } + return aligned ? mmp->a_l : mmp->l; +} + +static uint32_t ggml_vk_guess_matmul_id_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n) { + VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")"); + return ggml_vk_guess_matmul_id_pipeline(ctx, mmp, m, n, true)->align; +} + static void ggml_vk_matmul_id( ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& ids, @@ -3350,10 +3637,10 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub const int y_ne = ne11 * ne10; const int d_ne = ne11 * ne01; - const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11)); + const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11, src0->type)); const bool aligned = ne10 == kpad && ne01 > 8 && ne11 > 8; - vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned); + vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned, src0->type); const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline); @@ -3904,7 +4191,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig; - vk_matmul_pipeline mmp = ggml_vk_get_mul_mat_mat_id_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type); + vk_matmul_pipeline mmp = ggml_vk_get_mul_mat_mat_id_pipeline(ctx, src0->type, y_non_contig ? 
GGML_TYPE_F16 : src1->type, (ggml_prec)dst->op_params[0]); const bool qx_needs_dequant = mmp == nullptr || x_non_contig; const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig; @@ -3920,10 +4207,10 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& const uint64_t y_ne = ne11 * ne10; const uint64_t d_ne = ne21 * ne20; - const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, nei1)); + const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_id_pipeline_align(ctx, mmp, ne01, nei1)); const bool aligned = ne10 == kpad && ne01 > 8 && nei1 > 8; - vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, nei1, aligned); + vk_pipeline pipeline = ggml_vk_guess_matmul_id_pipeline(ctx, mmp, ne01, nei1, aligned); const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type); const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); @@ -5504,19 +5791,27 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t for (size_t i = 0; i < x_ne; i++) { if (std::is_same()) { x[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f; + // x[i] = 1.0f; + // x[i] = i + 1; + // x[i] = (i % k == i / k) ? 1.0f : 0.0f; } else if (std::is_same()) { x[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f); + // x[i] = ggml_fp32_to_fp16(1.0f); + // x[i] = ggml_fp32_to_fp16(i + 1); + // x[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f); } else { GGML_ABORT("fatal error"); } } for (size_t i = 0; i < y_ne; i++) { if (std::is_same()) { - // y[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f; - y[i] = (i % k == i / k) ? 1.0f : 0.0f; + y[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f; + // y[i] = (i % k == i / k) ? 1.0f : 0.0f; + // y[i] = i + 1; } else if (std::is_same()) { - // y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f); - y[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f); + y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f); + // y[i] = ggml_fp32_to_fp16((i % k == i / k) ? 
1.0f : 0.0f); + // y[i] = ggml_fp32_to_fp16(i + 1); } else { GGML_ABORT("fatal error"); } @@ -5600,7 +5895,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t double err = std::fabs(d[i] - d_chk[i]); avg_err += err; - if (err > 0.05f && first_err_n == -1) { + if ((err > 0.05f || std::isnan(err)) && first_err_n == -1) { first_err_b = i / (m * n); first_err_n = (i % (m * n)) / m; first_err_m = (i % (m * n)) % m; @@ -5613,12 +5908,10 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t std::cerr << "TEST " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time / num_it << "ms " << tflops << " TFLOPS avg_err=" << avg_err << std::endl; - if (avg_err > 0.1) { + if (avg_err > 0.1 || std::isnan(avg_err)) { std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl; std::cerr << "Actual result: " << std::endl << std::endl; ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); - std::cerr << std::endl; - ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n + 15, first_err_b); std::cerr << "Expected result: " << std::endl << std::endl; ggml_vk_print_matrix_area(d_chk, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); @@ -5801,13 +6094,13 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, vk_pipeline p; std::string shname; if (shader_size == 0) { - p = ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->a_s; + p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->a_s : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->a_s; shname = std::string(ggml_type_name(quant)) + "_ALIGNED_S"; } else if (shader_size == 1) { - p = ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->a_m; + p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->a_m : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->a_m; shname = std::string(ggml_type_name(quant)) + "_ALIGNED_M"; } else if (shader_size == 2) { - p = ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->a_l; + p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->a_l : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->a_l; shname = std::string(ggml_type_name(quant)) + "_ALIGNED_L"; } else { GGML_ASSERT(0); @@ -5817,13 +6110,13 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, if (k != kpad) { if (shader_size == 0) { - p = ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->s; + p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->s : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->s; shname = std::string(ggml_type_name(quant)) + "_S"; } else if (shader_size == 1) { - p = ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->m; + p = ctx->device->fp16 ? ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->m : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->m; shname = std::string(ggml_type_name(quant)) + "_M"; } else if (shader_size == 2) { - p = ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->l; + p = ctx->device->fp16 ? 
ctx->device->pipeline_dequant_mul_mat_mat[quant].f16acc->l : ctx->device->pipeline_dequant_mul_mat_mat[quant].f32acc->l; shname = std::string(ggml_type_name(quant)) + "_L"; } else { GGML_ASSERT(0); @@ -5982,105 +6275,13 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { #if defined(GGML_VULKAN_RUN_TESTS) - ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_F32); - ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_0); - ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_1); - ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_0); - ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_1); - ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q8_0); - ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q2_K); - ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q3_K); - ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q4_K); - ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_K); - ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q6_K); - ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_IQ4_NL); - - ggml_vk_test_matmul(ctx, 512, 512, 100, 32, 100, 1, 2); - - ggml_vk_test_matmul(ctx, 128, 512, 512, 2, 100, 1, 0); - ggml_vk_test_matmul(ctx, 128, 512, 512, 2, 100, 1, 1); - ggml_vk_test_matmul(ctx, 128, 512, 512, 2, 100, 1, 2); - // ggml_vk_test_matmul(ctx, 128, 512, 512, 2, 100, 4, 0); - // ggml_vk_test_matmul(ctx, 128, 512, 512, 2, 100, 4, 1); - // ggml_vk_test_matmul(ctx, 128, 512, 512, 2, 100, 4, 2); - - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_0); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_0); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_0); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_0); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_0); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_0); - - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_1); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_1); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_1); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_1); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_1); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_1); - - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_0); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_0); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_0); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_0); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_0); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_0); - - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_1); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_1); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_1); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_1); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_1); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_1); - - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q8_0); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 
512, 2, 100, 1, 1, GGML_TYPE_Q8_0); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q8_0); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q8_0); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0); - - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K); - - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K); - - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K); - - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K); - - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K); - // ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K); - - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_IQ4_NL); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_IQ4_NL); - ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_IQ4_NL); - - std::cerr << std::endl; - const std::vector vals { + 512, 512, 128, + 128, 512, 512, + 4096, 512, 4096, + 11008, 512, 4096, + 4096, 512, 11008, + 32000, 512, 4096, 8, 8, 8, 100, 46, 576, 623, 111, 128, @@ -6093,15 +6294,6 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { 49, 49, 128, 128, 49, 49, 4096, 49, 4096, - 11008, 49, 4096, - 4096, 49, 11008, - 32000, 49, 4096, - 512, 512, 128, - 128, 512, 512, - 4096, 512, 
4096, - 11008, 512, 4096, - 4096, 512, 11008, - 32000, 512, 4096, }; const size_t num_it = 100; @@ -6109,10 +6301,45 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0); ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1); ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2); - // ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0); - // ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1); - // ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2); - std::cerr << std::endl; + std::cerr << '\n'; + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 0); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 1); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 2); + std::cerr << '\n'; + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1); + ggml_vk_test_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2); + std::cerr << '\n' << std::endl; + + if (vals[i + 2] % 32 == 0) { + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0, GGML_TYPE_Q4_0); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1, GGML_TYPE_Q4_0); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2, GGML_TYPE_Q4_0); + std::cerr << '\n'; + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 0, GGML_TYPE_Q4_0); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 1, GGML_TYPE_Q4_0); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 2, GGML_TYPE_Q4_0); + std::cerr << '\n'; + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0, GGML_TYPE_Q4_0); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1, GGML_TYPE_Q4_0); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2, GGML_TYPE_Q4_0); + std::cerr << '\n' << std::endl; + } + + if (vals[i + 2] % 256 == 0) { + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 0, GGML_TYPE_Q4_K); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 1, GGML_TYPE_Q4_K); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 1, 2, GGML_TYPE_Q4_K); + std::cerr << '\n'; + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 0, GGML_TYPE_Q4_K); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 1, GGML_TYPE_Q4_K); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 2, 2, GGML_TYPE_Q4_K); + std::cerr << '\n'; + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 0, GGML_TYPE_Q4_K); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 1, GGML_TYPE_Q4_K); + ggml_vk_test_dequant_matmul(ctx, vals[i], vals[i + 1], vals[i + 2], 2, num_it, 4, 2, GGML_TYPE_Q4_K); + std::cerr << '\n' << std::endl; + } } GGML_ABORT("fatal error"); @@ -7200,8 +7427,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_MUL_MAT_ID: { ggml_backend_vk_device_context * ctx = 
(ggml_backend_vk_device_context *)dev->context; - if (op->op == GGML_OP_MUL_MAT_ID && - ggml_vk_get_device(ctx->device)->properties.limits.maxComputeSharedMemorySize < 32768) { + const vk_device& device = ggml_vk_get_device(ctx->device); + if (op->op == GGML_OP_MUL_MAT_ID && !device->mul_mat_id_s && !device->mul_mat_id_m && !device->mul_mat_id_l) { // If there's not enough shared memory for row_ids and the result tile, fallback to CPU return false; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp index 2ff5c4305..48122cbef 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp @@ -7,6 +7,12 @@ #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require #endif +#ifdef COOPMAT +#extension GL_KHR_cooperative_matrix : enable +#extension GL_KHR_memory_scope_semantics : enable +#extension GL_KHR_shader_subgroup_basic : enable +#endif + #ifdef MUL_MAT_ID #extension GL_EXT_shader_explicit_arithmetic_types_int16 : require #endif @@ -57,6 +63,7 @@ layout (push_constant) uniform parameter #endif } p; +layout (constant_id = 0) const uint BLOCK_SIZE = 64; layout (constant_id = 1) const uint BM = 64; layout (constant_id = 2) const uint BN = 64; layout (constant_id = 3) const uint BK = 16; // Assumed to be 32 if working with a quant @@ -65,13 +72,26 @@ layout (constant_id = 5) const uint WN = 32; layout (constant_id = 6) const uint WMITER = 2; layout (constant_id = 7) const uint TM = 4; layout (constant_id = 8) const uint TN = 2; -layout (constant_id = 9) const uint WARP = 32; +layout (constant_id = 9) const uint TK = 1; // Only needed for coopmat +layout (constant_id = 10) const uint WARP = 32; -shared FLOAT_TYPE buf_a[BM * (BK+1)]; -shared FLOAT_TYPE buf_b[BN * (BK+1)]; +#ifdef COOPMAT +#define SHMEM_STRIDE (BK + 8) +#else +#define SHMEM_STRIDE (BK + 1) +#endif + +shared FLOAT_TYPE buf_a[BM * SHMEM_STRIDE]; +shared FLOAT_TYPE buf_b[BN * SHMEM_STRIDE]; #ifdef MUL_MAT_ID shared u16vec2 row_ids[3072]; +#endif // MUL_MAT_ID + +#define NUM_WARPS (BLOCK_SIZE / WARP) + +#ifdef COOPMAT +shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS]; #endif void main() { @@ -98,17 +118,32 @@ void main() { const uint ik = gl_WorkGroupID.x / blocks_m; const uint ic = gl_WorkGroupID.y; - const uint warp_i = gl_LocalInvocationID.x / WARP; - const uint warp_r = warp_i % (BM / WM); - const uint warp_c = warp_i / (BM / WM); - const uint WNITER = (WM * WN) / (WARP * TM * TN * WMITER); const uint WSUBM = WM / WMITER; const uint WSUBN = WN / WNITER; +#ifdef COOPMAT + const uint warp_i = gl_SubgroupID; + + const uint tiw = gl_SubgroupInvocationID; + + const uint cms_per_row = WM / TM; + const uint cms_per_col = WN / TN; + + const uint storestride = WARP / TM; + const uint store_r = tiw % TM; + const uint store_c = tiw / TM; +#else + const uint warp_i = gl_LocalInvocationID.x / WARP; + const uint tiw = gl_LocalInvocationID.x % WARP; + const uint tiwr = tiw % (WSUBM / TM); const uint tiwc = tiw / (WSUBM / TM); +#endif + + const uint warp_r = warp_i % (BM / WM); + const uint warp_c = warp_i / (BM / WM); const uint loadr_a = gl_LocalInvocationID.x % (BK / LOAD_VEC_A); const uint loadc_a = gl_LocalInvocationID.x / (BK / LOAD_VEC_A); @@ -156,21 +191,31 @@ void main() { uint pos_b = (batch_idx * p.batch_stride_b + ic * BN * p.stride_b + start_k) / LOAD_VEC_B; #endif - float sums[WMITER * TM * WNITER * TN]; +#ifdef COOPMAT + coopmat cache_a; + coopmat cache_b; + coopmat sums[cms_per_row * cms_per_col]; + + 
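+    // Each subgroup covers its WM x WN output tile with cms_per_row * cms_per_col
+    // cooperative-matrix accumulators of size TM x TN; they are zeroed below before
+    // walking the K dimension in steps of TK.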
[[unroll]] for (uint i = 0; i < cms_per_row * cms_per_col; i++) { + sums[i] = coopmat(0.0f); + } +#else + ACC_TYPE sums[WMITER * TM * WNITER * TN]; FLOAT_TYPE cache_a[WMITER * TM]; FLOAT_TYPE cache_b[WNITER * TN]; [[unroll]] for (uint i = 0; i < WMITER*TM*WNITER*TN; i++) { - sums[i] = 0.0f; + sums[i] = ACC_TYPE(0.0f); } +#endif - [[unroll]] for (uint block = start_k; block < end_k; block += BK) { + for (uint block = start_k; block < end_k; block += BK) { [[unroll]] for (uint l = 0; l < BM; l += loadstride_a) { #if defined(DATA_A_F32) || defined(DATA_A_F16) #if LOAD_VEC_A == 8 const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; buf_a[buf_idx ] = FLOAT_TYPE(data_a[idx][0].x); buf_a[buf_idx + 1] = FLOAT_TYPE(data_a[idx][0].y); buf_a[buf_idx + 2] = FLOAT_TYPE(data_a[idx][0].z); @@ -181,21 +226,21 @@ void main() { buf_a[buf_idx + 7] = FLOAT_TYPE(data_a[idx][1].w); #elif LOAD_VEC_A == 4 const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; buf_a[buf_idx ] = FLOAT_TYPE(data_a[idx].x); buf_a[buf_idx + 1] = FLOAT_TYPE(data_a[idx].y); buf_a[buf_idx + 2] = FLOAT_TYPE(data_a[idx].z); buf_a[buf_idx + 3] = FLOAT_TYPE(data_a[idx].w); #else if (ir * BM + loadc_a + l < p.M && block + loadr_a < end_k) { - buf_a[(loadc_a + l) * (BK+1) + loadr_a] = FLOAT_TYPE(data_a[pos_a + (loadc_a + l) * p.stride_a + loadr_a]); + buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = FLOAT_TYPE(data_a[pos_a + (loadc_a + l) * p.stride_a + loadr_a]); } else { - buf_a[(loadc_a + l) * (BK+1) + loadr_a] = FLOAT_TYPE(0.0f); + buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = FLOAT_TYPE(0.0f); } #endif #elif defined(DATA_A_Q4_0) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a; const uint ib = idx / 16; const uint iqs = idx & 0xF; @@ -208,7 +253,7 @@ void main() { buf_a[buf_idx + 16] = FLOAT_TYPE(v.y); #elif defined(DATA_A_Q4_1) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a; const uint ib = idx / 16; const uint iqs = idx & 0xF; @@ -222,7 +267,7 @@ void main() { buf_a[buf_idx + 16] = FLOAT_TYPE(v.y); #elif defined(DATA_A_Q5_0) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a; const uint ib = idx / 16; const uint iqs = idx & 0xF; @@ -237,7 +282,7 @@ void main() { buf_a[buf_idx + 16] = FLOAT_TYPE(v.y); #elif defined(DATA_A_Q5_1) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a; const uint ib = idx / 16; const uint iqs = idx & 0xF; @@ -253,7 +298,7 @@ void main() { buf_a[buf_idx + 16] = FLOAT_TYPE(v.y); #elif defined(DATA_A_Q8_0) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * 
LOAD_VEC_A; const uint ib = idx / 16; const uint iqs = (idx & 0xF) * 2; @@ -265,7 +310,7 @@ void main() { buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); #elif defined(DATA_A_Q2_K) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; const uint ib = idx / 128; // 2 values per idx const uint iqs = idx % 128; // 0..127 @@ -284,7 +329,7 @@ void main() { buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); #elif defined(DATA_A_Q3_K) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; const uint ib = idx / 128; // 2 values per idx const uint iqs = idx % 128; // 0..127 @@ -308,7 +353,7 @@ void main() { buf_a[buf_idx + 1] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi + 1] >> qsshift) & 3) - (((data_a[ib].hmask[hmi + 1] & m) != 0) ? 0 : 4))); #elif defined(DATA_A_Q4_K) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; const uint ib = idx / 128; // 2 values per idx const uint iqs = idx % 128; // 0..127 @@ -320,15 +365,20 @@ void main() { const vec2 loadd = vec2(data_a[ib].d); - uint8_t sc; - uint8_t mbyte; - if (is < 4) { - sc = uint8_t(data_a[ib].scales[is ] & 63); - mbyte = uint8_t(data_a[ib].scales[is + 4] & 63); - } else { - sc = uint8_t((data_a[ib].scales[is + 4] & 0xF) | ((data_a[ib].scales[is - 4] >> 6) << 4)); - mbyte = uint8_t((data_a[ib].scales[is + 4] >> 4) | ((data_a[ib].scales[is ] >> 6) << 4)); - } + const uint scidx0 = (is < 4) ? is : (is + 4); + const uint scidx1 = (is < 4) ? is : (is - 4); + const uint scidxmask1 = (is < 4) ? 0x30 : 0xC0; + const uint scidxshift1 = (is < 4) ? 0 : 2; + const uint mbidx0 = is + 4; + const uint mbidx1 = (is < 4) ? is + 4 : is; + const uint mbidxmask0 = (is < 4) ? 0xF : 0xF0; + const uint mbidxshift0 = (is < 4) ? 0 : 4; + const uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0; + const uint mbidxshift1 = (is < 4) ? 0 : 2; + + const uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1)); + const uint8_t mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1)); + const float d = loadd.x * sc; const float m = -loadd.y * mbyte; @@ -336,7 +386,7 @@ void main() { buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF), m)); #elif defined(DATA_A_Q5_K) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; const uint ib = idx / 128; // 2 values per idx const uint iqs = idx % 128; // 0..127 @@ -351,15 +401,20 @@ void main() { const vec2 loadd = vec2(data_a[ib].d); - uint8_t sc; - uint8_t mbyte; - if (is < 4) { - sc = uint8_t(data_a[ib].scales[is ] & 63); - mbyte = uint8_t(data_a[ib].scales[is + 4] & 63); - } else { - sc = uint8_t((data_a[ib].scales[is + 4] & 0xF) | ((data_a[ib].scales[is - 4] >> 6) << 4)); - mbyte = uint8_t((data_a[ib].scales[is + 4] >> 4) | ((data_a[ib].scales[is ] >> 6) << 4)); - } + const uint scidx0 = (is < 4) ? 
is : (is + 4); + const uint scidx1 = (is < 4) ? is : (is - 4); + const uint scidxmask1 = (is < 4) ? 0x30 : 0xC0; + const uint scidxshift1 = (is < 4) ? 0 : 2; + const uint mbidx0 = is + 4; + const uint mbidx1 = (is < 4) ? is + 4 : is; + const uint mbidxmask0 = (is < 4) ? 0xF : 0xF0; + const uint mbidxshift0 = (is < 4) ? 0 : 4; + const uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0; + const uint mbidxshift1 = (is < 4) ? 0 : 2; + + const uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1)); + const uint8_t mbyte = uint8_t(((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1)); + const float d = loadd.x * sc; const float m = -loadd.y * mbyte; @@ -367,7 +422,7 @@ void main() { buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 16 : 0), m)); #elif defined(DATA_A_Q6_K) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; const uint ib = idx / 128; // 2 values per idx const uint iqs = idx % 128; // 0..127 @@ -386,7 +441,7 @@ void main() { buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32)); #elif defined(DATA_A_IQ4_NL) const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a; + const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a; const uint ib = idx / 16; const uint iqs = idx & 0xF; @@ -407,7 +462,7 @@ void main() { #else const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b; #endif - const uint buf_idx = (loadc_b + l) * (BK+1) + loadr_b * LOAD_VEC_B; + const uint buf_idx = (loadc_b + l) * SHMEM_STRIDE + loadr_b * LOAD_VEC_B; buf_b[buf_idx + 0] = FLOAT_TYPE(data_b[idx][0].x); buf_b[buf_idx + 1] = FLOAT_TYPE(data_b[idx][0].y); buf_b[buf_idx + 2] = FLOAT_TYPE(data_b[idx][0].z); @@ -423,24 +478,24 @@ void main() { #else const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b; #endif - const uint buf_idx = (loadc_b + l) * (BK+1) + loadr_b * LOAD_VEC_B; + const uint buf_idx = (loadc_b + l) * SHMEM_STRIDE + loadr_b * LOAD_VEC_B; buf_b[buf_idx + 0] = FLOAT_TYPE(data_b[idx].x); buf_b[buf_idx + 1] = FLOAT_TYPE(data_b[idx].y); buf_b[buf_idx + 2] = FLOAT_TYPE(data_b[idx].z); buf_b[buf_idx + 3] = FLOAT_TYPE(data_b[idx].w); #elif !MUL_MAT_ID if (ic * BN + loadc_b + l < p.N && block + loadr_b < end_k) { - buf_b[(loadc_b + l) * (BK+1) + loadr_b] = FLOAT_TYPE(data_b[pos_b + (loadc_b + l) * p.stride_b + loadr_b]); + buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(data_b[pos_b + (loadc_b + l) * p.stride_b + loadr_b]); } else { - buf_b[(loadc_b + l) * (BK+1) + loadr_b] = FLOAT_TYPE(0.0f); + buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(0.0f); } #else const uint row_i = ic * BN + loadc_b + l; if (row_i < _ne1) { const u16vec2 row_idx = row_ids[row_i]; - buf_b[(loadc_b + l) * (BK+1) + loadr_b] = FLOAT_TYPE(data_b[pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + loadr_b]); + buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(data_b[pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + loadr_b]); } else { - buf_b[(loadc_b + l) * (BK+1) + loadr_b] = 
FLOAT_TYPE(0.0f); + buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(0.0f); } #endif } @@ -450,16 +505,30 @@ void main() { pos_a += BK / LOAD_VEC_A; pos_b += BK / LOAD_VEC_B; - for (uint i = 0; i < BK; i++) { +#ifdef COOPMAT + [[unroll]] for (uint i = 0; i < BK; i += TK) { + [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) { + // Load from shared into cache + coopMatLoad(cache_a, buf_a, (warp_r * WM + cm_row * TM) * SHMEM_STRIDE + i, SHMEM_STRIDE, gl_CooperativeMatrixLayoutRowMajor); + + [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) { + coopMatLoad(cache_b, buf_b, (warp_c * WN + cm_col * TN) * SHMEM_STRIDE + i, SHMEM_STRIDE, gl_CooperativeMatrixLayoutColumnMajor); + + sums[cm_col * cms_per_row + cm_row] = coopMatMulAdd(cache_a, cache_b, sums[cm_col * cms_per_row + cm_row]); + } + } + } +#else + [[unroll]] for (uint i = 0; i < BK; i++) { // Load from shared into cache [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) { [[unroll]] for (uint j = 0; j < TM; j++) { - cache_a[wsir * TM + j] = buf_a[(warp_r * WM + wsir * WSUBM + tiwr * TM + j) * (BK+1) + i]; + cache_a[wsir * TM + j] = buf_a[(warp_r * WM + wsir * WSUBM + tiwr * TM + j) * SHMEM_STRIDE + i]; } } [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) { [[unroll]] for (uint j = 0; j < TN; j++) { - cache_b[wsic * TN + j] = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + j) * (BK+1) + i]; + cache_b[wsic * TN + j] = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + j) * SHMEM_STRIDE + i]; } } @@ -468,12 +537,13 @@ void main() { [[unroll]] for (uint cc = 0; cc < TN; cc++) { [[unroll]] for (uint cr = 0; cr < TM; cr++) { const uint sums_idx = (wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr; - sums[sums_idx] = fma(float(cache_a[wsir * TM + cr]), float(cache_b[wsic * TN + cc]), sums[sums_idx]); + sums[sums_idx] = fma(ACC_TYPE(cache_a[wsir * TM + cr]), ACC_TYPE(cache_b[wsic * TN + cc]), sums[sums_idx]); } } } } } +#endif barrier(); } @@ -485,6 +555,54 @@ void main() { const uint offsets = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z; #endif +#ifdef COOPMAT +#ifdef MUL_MAT_ID + [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) { + [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) { + coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor); + + [[unroll]] for (uint col = 0; col < BN; col += storestride) { + const uint row_i = dc + cm_col * TN + col + store_c; + if (row_i >= _ne1) break; + + const u16vec2 row_idx = row_ids[row_i]; + + data_d[row_idx.y * p.batch_stride_d + row_idx.x * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]); + } + } + } +#else + const bool is_aligned = p.stride_d % 4 == 0; // Assumption: D_TYPE == float + + [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) { + [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) { + const bool is_in_bounds = dr + (cm_row + 1) * TM <= p.M && dc + (cm_col + 1) * TN <= p.N; + + if (is_aligned && is_in_bounds) { + // Full coopMat is within bounds and stride_d is aligned with 16B + coopmat cm_dtype = coopmat(sums[cm_col * cms_per_row + cm_row]); + coopMatStore(cm_dtype, data_d, offsets + (dc + cm_col * TN) * p.stride_d + dr + cm_row * TM, p.stride_d, gl_CooperativeMatrixLayoutColumnMajor); + } else if (is_in_bounds) { + // Full coopMat is within bounds, but stride_d is not aligned + coopMatStore(sums[cm_col * cms_per_row + 
cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor); + + [[unroll]] for (uint col = 0; col < TN; col += storestride) { + data_d[offsets + (dc + cm_col * TN + col + store_c) * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]); + } + } else if (dr + cm_row * TM < p.M && dc + cm_col * TN < p.N) { + // Partial coopMat is within bounds + coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor); + + [[unroll]] for (uint col = 0; col < TN; col += storestride) { + if (dr + cm_row * TM + store_r < p.M && dc + cm_col * TN + col + store_c < p.N) { + data_d[offsets + (dc + cm_col * TN + col + store_c) * p.stride_d + dr + cm_row * TM + store_r] = D_TYPE(coopmat_stage[warp_i * TM * TN + (col + store_c) * TM + store_r]); + } + } + } + } + } +#endif // MUL_MAT_ID +#else [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) { [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) { @@ -496,7 +614,7 @@ void main() { if (row_i >= _ne1) break; const u16vec2 row_idx = row_ids[row_i]; -#endif +#endif // MUL_MAT_ID [[unroll]] for (uint cr = 0; cr < TM; cr++) { #ifdef MUL_MAT_ID data_d[row_idx.y * p.batch_stride_d + row_idx.x * p.stride_d + dr_warp + cr] = D_TYPE(sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr]); @@ -504,9 +622,10 @@ void main() { if (dr_warp + cr < p.M && dc_warp + cc < p.N) { data_d[offsets + (dc_warp + cc) * p.stride_d + dr_warp + cr] = D_TYPE(sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr]); } -#endif +#endif // MUL_MAT_ID } } } } +#endif // COOPMAT } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 4716e2c80..c4967ed65 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -60,6 +60,7 @@ const std::vector type_names = { "iq4_nl" }; +namespace { void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str) { #ifdef _WIN32 HANDLE stdout_read, stdout_write; @@ -198,8 +199,8 @@ static uint32_t compile_count = 0; static std::mutex compile_count_mutex; static std::condition_variable compile_count_cond; -void string_to_spv_func(const std::string& _name, const std::string& in_fname, const std::map& defines, bool fp16 = true, bool coopmat2 = false, bool f16acc = false) { - std::string name = _name + (f16acc ? "_f16acc" : "") + (coopmat2 ? "_cm2" : (fp16 ? "" : "_fp32")); +void string_to_spv_func(const std::string& _name, const std::string& in_fname, const std::map& defines, bool fp16 = true, bool coopmat = false, bool coopmat2 = false, bool f16acc = false) { + std::string name = _name + (f16acc ? "_f16acc" : "") + (coopmat ? "_coopmat" : "") + (coopmat2 ? "_cm2" : (fp16 ? "" : "_fp32")); std::string out_fname = join_paths(output_dir, name + ".spv"); std::string in_path = join_paths(input_dir, in_fname); @@ -258,7 +259,7 @@ std::map merge_maps(const std::map> compiles; -void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map& defines, bool fp16 = true, bool coopmat2 = false, bool f16acc = false) { +void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map& defines, bool fp16 = true, bool coopmat = false, bool coopmat2 = false, bool f16acc = false) { { // wait until fewer than N compiles are in progress. 
// 16 is an arbitrary limit, the goal is to avoid "failed to create pipe" errors. @@ -269,10 +270,10 @@ void string_to_spv(const std::string& _name, const std::string& in_fname, const } compile_count++; } - compiles.push_back(std::async(string_to_spv_func, _name, in_fname, defines, fp16, coopmat2, f16acc)); + compiles.push_back(std::async(string_to_spv_func, _name, in_fname, defines, fp16, coopmat, coopmat2, f16acc)); } -void matmul_shaders(bool fp16, bool matmul_id, bool coopmat2, bool f16acc) { +void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool f16acc) { std::string load_vec = coopmat2 ? "1" : fp16 ? "8" : "4"; std::string aligned_b_type_f32 = coopmat2 ? "float" : fp16 ? "mat2x4" : "vec4"; std::string aligned_b_type_f16 = coopmat2 ? "float16_t" : fp16 ? "f16mat2x4" : "f16vec4"; @@ -291,14 +292,20 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat2, bool f16acc) { base_dict["ACC_TYPE"] = f16acc ? "float16_t" : "float"; + if (coopmat) { + base_dict["COOPMAT"] = "1"; + } + + base_dict["ACC_TYPE"] = f16acc ? "float16_t" : "float"; + std::string source_name = coopmat2 ? "mul_mm_cm2.comp" : "mul_mm.comp"; // Shaders with f16 B_TYPE - string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, }), fp16, coopmat2, f16acc); - string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc); + string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, }), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); - string_to_spv(shader_name + "_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc); - string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat2, f16acc); + string_to_spv(shader_name + "_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc); for (const auto& tname : type_names) { std::string data_a_key = "DATA_A_" + to_uppercase(tname); @@ -307,12 +314,12 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat2, bool f16acc) { // For aligned matmul loads std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16") ? 
load_vec : "2"; - string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat2, f16acc); - string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); if (tname != "f16" && tname != "f32") { - string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat2, f16acc); - string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); } } } @@ -322,25 +329,24 @@ void process_shaders() { std::map base_dict = {{"FLOAT_TYPE", "float"}}; // matmul - for (const auto& fp16 : {false, true}) { - for (const auto& matmul_id : {false, true}) { - for (const auto& coopmat2 : {false, true}) { - for (const auto& f16acc : {false, true}) { -#if !defined(VK_NV_cooperative_matrix2) - if (coopmat2) { - continue; - } + for (const auto& matmul_id : {false, true}) { + // No coopmats + // fp32 + matmul_shaders(false, matmul_id, false, false, false); + + // fp16, fp32acc and fp16acc + matmul_shaders(true, matmul_id, false, false, false); + matmul_shaders(true, matmul_id, false, false, true); + + // Coopmat, fp32acc and fp16acc + matmul_shaders(true, matmul_id, true, false, false); + matmul_shaders(true, matmul_id, true, false, true); + +#if defined(VK_NV_cooperative_matrix2) + // Coopmat2, fp32acc and fp16acc + matmul_shaders(true, matmul_id, false, true, false); + matmul_shaders(true, matmul_id, false, true, true); #endif - if (coopmat2 && !fp16) { - continue; - } - if (!coopmat2 && f16acc) { - continue; - } - matmul_shaders(fp16, matmul_id, coopmat2, f16acc); - } - } - } } #if defined(VK_NV_cooperative_matrix2) @@ -355,11 +361,11 @@ void process_shaders() { if (tname == "f16") { string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp", - merge_maps(base_dict, {{"Q_TYPE", "float"}, 
{"D_TYPE", "float"}, {"ACC_TYPE", acctype}}), true, true, f16acc); + merge_maps(base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}}), true, false, true, f16acc); } else { std::string data_a_key = "DATA_A_" + to_uppercase(tname); string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp", - merge_maps(base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}, {"DEQUANTFUNC", "dequantFunc"+to_uppercase(tname) }, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }}), true, true, f16acc); + merge_maps(base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}, {"DEQUANTFUNC", "dequantFunc"+to_uppercase(tname) }, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }}), true, false, true, f16acc); } } } @@ -524,6 +530,7 @@ void write_output_files() { fclose(hdr); fclose(src); } +} int main(int argc, char** argv) { std::map args; From c2a16c0bdbe2e51adf318918bad82f0c3e3d6f3b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 7 Dec 2024 11:52:44 +0200 Subject: [PATCH 21/99] server : fix free of spec context and batch (#10651) ggml-ci --- common/speculative.cpp | 4 ++++ examples/server/server.cpp | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index e559675c4..3fcbb0020 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -62,6 +62,10 @@ struct common_speculative * common_speculative_init( } void common_speculative_free(struct common_speculative * spec) { + if (spec == nullptr) { + return; + } + common_sampler_free(spec->smpl); llama_batch_free(spec->batch); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 809fafa18..d57a296a2 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -720,7 +720,7 @@ struct server_slot { int id; int id_task = -1; - llama_batch batch_spec; + llama_batch batch_spec = {}; llama_context * ctx_dft = nullptr; From 19d8762ab61df8286367588a80b9c7db4cb568db Mon Sep 17 00:00:00 2001 From: Djip007 <3705339+Djip007@users.noreply.github.com> Date: Sat, 7 Dec 2024 13:37:50 +0100 Subject: [PATCH 22/99] ggml : refactor online repacking (#10446) * rename ggml-cpu-aarch64.c to .cpp * reformat extra cpu backend. - clean Q4_0_N_M and IQ4_0_N_M - remove from "file" tensor type - allow only with dynamic repack - extract cpu extra bufts and convert to C++ - hbm - "aarch64" - more generic use of extra buffer - generalise extra_supports_op - new API for "cpu-accel": - amx - aarch64 * clang-format * Clean Q4_0_N_M ref Enable restrict on C++ * add op GGML_OP_MUL_MAT_ID for Q4_0_N_M with runtime repack * added/corrected control on tensor size for Q4 repacking. * Update ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp Co-authored-by: Georgi Gerganov * Update ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp Co-authored-by: Georgi Gerganov * add debug logs on repacks. 
--------- Co-authored-by: Georgi Gerganov --- Makefile | 22 +- Package.swift | 5 +- docs/build.md | 2 +- examples/quantize/README.md | 2 - examples/quantize/quantize.cpp | 3 - ggml/include/ggml-cpu.h | 17 - ggml/include/ggml.h | 29 +- ggml/src/CMakeLists.txt | 4 +- ggml/src/ggml-aarch64.c | 129 ---- ggml/src/ggml-aarch64.h | 19 - ggml/src/ggml-cann/ggml-cann.cpp | 2 +- ggml/src/ggml-common.h | 88 +-- ggml/src/ggml-cpu/CMakeLists.txt | 6 +- ggml/src/ggml-cpu/amx/amx.cpp | 162 ++-- ggml/src/ggml-cpu/amx/amx.h | 14 +- ggml/src/ggml-cpu/amx/common.h | 19 +- ggml/src/ggml-cpu/amx/mmq.cpp | 17 +- ggml/src/ggml-cpu/amx/mmq.h | 8 +- ...gml-cpu-aarch64.c => ggml-cpu-aarch64.cpp} | 723 ++++++++++++++---- ggml/src/ggml-cpu/ggml-cpu-aarch64.h | 28 +- ggml/src/ggml-cpu/ggml-cpu-hbm.cpp | 55 ++ ggml/src/ggml-cpu/ggml-cpu-hbm.h | 8 + ggml/src/ggml-cpu/ggml-cpu-traits.cpp | 36 + ggml/src/ggml-cpu/ggml-cpu-traits.h | 38 + ggml/src/ggml-cpu/ggml-cpu.c | 436 ++++------- ggml/src/ggml-cpu/ggml-cpu.cpp | 172 +---- ggml/src/ggml-cuda/ggml-cuda.cu | 2 +- ggml/src/ggml-quants.c | 9 - ggml/src/ggml-sycl/ggml-sycl.cpp | 2 +- ggml/src/ggml.c | 86 ++- gguf-py/gguf/constants.py | 12 +- include/llama.h | 6 +- src/llama.cpp | 24 +- 33 files changed, 1136 insertions(+), 1049 deletions(-) delete mode 100644 ggml/src/ggml-aarch64.c delete mode 100644 ggml/src/ggml-aarch64.h rename ggml/src/ggml-cpu/{ggml-cpu-aarch64.c => ggml-cpu-aarch64.cpp} (85%) create mode 100644 ggml/src/ggml-cpu/ggml-cpu-hbm.cpp create mode 100644 ggml/src/ggml-cpu/ggml-cpu-hbm.h create mode 100644 ggml/src/ggml-cpu/ggml-cpu-traits.cpp create mode 100644 ggml/src/ggml-cpu/ggml-cpu-traits.h diff --git a/Makefile b/Makefile index d76c4ad53..bcea450e4 100644 --- a/Makefile +++ b/Makefile @@ -445,6 +445,10 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) MK_CFLAGS += -march=native -mtune=native HOST_CXXFLAGS += -march=native -mtune=native + # Usage AMX build test + #MK_CFLAGS += -march=graniterapids -mtune=graniterapids + #HOST_CXXFLAGS += -march=graniterapids -mtune=graniterapids + # Usage AVX-only #MK_CFLAGS += -mfma -mf16c -mavx #MK_CXXFLAGS += -mfma -mf16c -mavx @@ -948,7 +952,6 @@ DIR_COMMON = common OBJ_GGML = \ $(DIR_GGML)/src/ggml.o \ - $(DIR_GGML)/src/ggml-aarch64.o \ $(DIR_GGML)/src/ggml-alloc.o \ $(DIR_GGML)/src/ggml-backend.o \ $(DIR_GGML)/src/ggml-backend-reg.o \ @@ -956,9 +959,11 @@ OBJ_GGML = \ $(DIR_GGML)/src/ggml-quants.o \ $(DIR_GGML)/src/ggml-threading.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \ - $(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o \ + $(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \ + $(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \ + $(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \ $(OBJ_GGML_EXT) OBJ_LLAMA = \ @@ -1098,17 +1103,10 @@ DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d) # Default target all: $(BUILD_TARGETS) +# force c++ build for source file that have same name as c file # Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files -# g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml -$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \ - ggml/src/ggml-cpu/ggml-cpu.cpp \ - ggml/include/ggml-backend.h \ - ggml/include/ggml.h \ - ggml/include/ggml-alloc.h \ - ggml/src/ggml-backend-impl.h \ - ggml/include/ggml-cpu.h \ - ggml/src/ggml-impl.h - $(CXX) $(CXXFLAGS) -c $< -o $@ +$(DIR_GGML)/%_cpp.o: $(DIR_GGML)/%.cpp + $(CXX) $(CXXFLAGS) -MMD -c $< -o $@ # 
Rules for building object files $(DIR_GGML)/%.o: $(DIR_GGML)/%.c diff --git a/Package.swift b/Package.swift index d32b74a63..3afeb2f19 100644 --- a/Package.swift +++ b/Package.swift @@ -10,14 +10,15 @@ var sources = [ "src/unicode.cpp", "src/unicode-data.cpp", "ggml/src/ggml.c", - "ggml/src/ggml-aarch64.c", "ggml/src/ggml-alloc.c", "ggml/src/ggml-backend.cpp", "ggml/src/ggml-backend-reg.cpp", "ggml/src/ggml-cpu/ggml-cpu.c", "ggml/src/ggml-cpu/ggml-cpu.cpp", - "ggml/src/ggml-cpu/ggml-cpu-aarch64.c", + "ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp", + "ggml/src/ggml-cpu/ggml-cpu-hbm.cpp", "ggml/src/ggml-cpu/ggml-cpu-quants.c", + "ggml/src/ggml-cpu/ggml-cpu-traits.cpp", "ggml/src/ggml-threading.cpp", "ggml/src/ggml-quants.c", ] diff --git a/docs/build.md b/docs/build.md index a4964cbd1..26e673788 100644 --- a/docs/build.md +++ b/docs/build.md @@ -55,7 +55,7 @@ cmake --build build --config Release cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF cmake --build build-arm64-windows-llvm-release ``` - Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_4_8 CPU kernels. + Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_N_M CPU kernels. ## BLAS Build diff --git a/examples/quantize/README.md b/examples/quantize/README.md index 704f0d56b..5d1e11c67 100644 --- a/examples/quantize/README.md +++ b/examples/quantize/README.md @@ -54,8 +54,6 @@ As the models are currently fully loaded into memory, you will need adequate dis Several quantization methods are supported. They differ in the resulting model disk size and inference speed. -The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format. 
- *(outdated)* | Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index b98993210..8d47b17b6 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -48,9 +48,6 @@ static const std::vector QUANT_OPTIONS = { { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", }, { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", }, { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", }, - { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, - { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, - { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", }, { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index e14ea9ea5..3aa71badb 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -103,24 +103,14 @@ extern "C" { // Internal types and functions exposed for tests and benchmarks - typedef void (*ggml_from_float_to_mat_t) - (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs); typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, const void * GGML_RESTRICT y, size_t by, int nrc); - typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, - const void * GGML_RESTRICT y, int nr, int nc); - typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, - const void * GGML_RESTRICT y, int nr, int nc); struct ggml_type_traits_cpu { ggml_from_float_t from_float; - ggml_from_float_to_mat_t from_float_to_mat; ggml_vec_dot_t vec_dot; enum ggml_type vec_dot_type; int64_t nrows; // number of rows to process simultaneously - int64_t ncols; // number of columns to process simultaneously - ggml_gemv_t gemv; - ggml_gemm_t gemm; }; GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type); @@ -140,13 +130,6 @@ extern "C" { GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void); -#ifdef GGML_USE_CPU_HBM - GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void); -#endif - - GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void); - GGML_BACKEND_API bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft); - #ifdef __cplusplus } #endif diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 1c8cc11b6..386d5a15d 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -384,15 +384,15 @@ extern "C" { GGML_TYPE_F64 = 28, GGML_TYPE_IQ1_M = 29, GGML_TYPE_BF16 = 30, - GGML_TYPE_Q4_0_4_4 = 31, - GGML_TYPE_Q4_0_4_8 = 32, - GGML_TYPE_Q4_0_8_8 = 33, + // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files + // GGML_TYPE_Q4_0_4_8 = 32, + // GGML_TYPE_Q4_0_8_8 = 33, GGML_TYPE_TQ1_0 = 34, GGML_TYPE_TQ2_0 = 35, - GGML_TYPE_IQ4_NL_4_4 = 36, + // GGML_TYPE_IQ4_NL_4_4 = 36, // GGML_TYPE_IQ4_NL_4_8 = 37, // GGML_TYPE_IQ4_NL_8_8 = 38, - GGML_TYPE_COUNT, + GGML_TYPE_COUNT = 39, }; // precision @@ -433,9 +433,6 @@ extern "C" { GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors GGML_FTYPE_MOSTLY_BF16 = 
24, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors }; // available tensor operations: @@ -2205,11 +2202,19 @@ extern "C" { GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx); GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data); -#ifdef __cplusplus -// restrict not standard in C++ -#define GGML_RESTRICT +#ifdef __cplusplus + // restrict not standard in C++ +# if defined(__GNUC__) +# define GGML_RESTRICT __restrict__ +# elif defined(__clang__) +# define GGML_RESTRICT __restrict +# elif defined(_MSC_VER) +# define GGML_RESTRICT __restrict +# else +# define GGML_RESTRICT +# endif #else -#define GGML_RESTRICT restrict +# define GGML_RESTRICT restrict #endif typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index f07533fdb..a267a8b59 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -220,9 +220,7 @@ add_library(ggml-base ggml-threading.cpp ggml-threading.h ggml-quants.c - ggml-quants.h - ggml-aarch64.c - ggml-aarch64.h) + ggml-quants.h) target_include_directories(ggml-base PRIVATE .) diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c deleted file mode 100644 index 013912051..000000000 --- a/ggml/src/ggml-aarch64.c +++ /dev/null @@ -1,129 +0,0 @@ -#define GGML_COMMON_DECL_C -#include "ggml-common.h" - -#include "ggml-aarch64.h" -#include "ggml-impl.h" -#include "ggml-quants.h" -#include - -#define UNUSED GGML_UNUSED - -static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) { - block_q4_0x4 out; - - for (int i = 0; i < 4; i++) { - out.d[i] = in[i].d; - } - - const int end = QK4_0 * 2 / blck_size_interleave; - - if (blck_size_interleave == 8) { - const uint64_t xor_mask = 0x8888888888888888ULL; - for (int i = 0; i < end; ++i) { - int src_id = i % 4; - int src_offset = (i / 4) * blck_size_interleave; - int dst_offset = i * blck_size_interleave; - - uint64_t elems; - // Using memcpy to avoid unaligned memory accesses - memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t)); - elems ^= xor_mask; - memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t)); - } - } else if (blck_size_interleave == 4) { - const uint32_t xor_mask = 0x88888888; - for (int i = 0; i < end; ++i) { - int src_id = i % 4; - int src_offset = (i / 4) * blck_size_interleave; - int dst_offset = i * blck_size_interleave; - - uint32_t elems; - memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t)); - elems ^= xor_mask; - memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t)); - } - } else { - GGML_ASSERT(false); - } - - return out; -} - -// interleave 8 block_q4_0s in blocks of blck_size_interleave -// returns an interleaved block_q4_0x8 -// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks -// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave -static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) { - block_q4_0x8 out; - - for (int i = 0; i < 8; i++) { - out.d[i] = in[i].d; - } - - const int end = QK4_0 * 4 / blck_size_interleave; - const uint64_t xor_mask = 0x8888888888888888ULL; - - for (int i = 0; i < end; ++i) { - int src_id = i % 8; - int src_offset = (i / 8) * 
blck_size_interleave; - int dst_offset = i * blck_size_interleave; - - uint64_t elems; - memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t)); - elems ^= xor_mask; - memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t)); - } - - return out; -} - -static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int blck_size_interleave) { - assert(n_per_row % QK4_0 == 0); - const int nb = n_per_row / QK4_0; - - void * out_ptr = NULL; - if (nrows_interleaved == 8) { - out_ptr = (block_q4_0x8 *) dst; - } - else if (nrows_interleaved == 4) { - out_ptr = (block_q4_0x4 *) dst; - } - assert(nrows_interleaved <= 8); - block_q4_0 dst_tmp[8]; - - for (int b = 0; b < (nrow * n_per_row); b += nrows_interleaved * n_per_row) { - - for (int64_t x = 0; x < nb; x++) { - - for (int i = 0; i < nrows_interleaved; i++ ) { - quantize_row_q4_0_ref(src + b + i * n_per_row + x * QK4_0, (block_q4_0 *) dst_tmp + i, QK4_0); - } - - if (nrows_interleaved == 8) { - *(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave); - out_ptr = (block_q4_0x8 *) out_ptr + 1; - } - else if (nrows_interleaved == 4) { - *(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave); - out_ptr = (block_q4_0x4 *) out_ptr + 1; - } - } - } - - return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0)); -} - -size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { - UNUSED(quant_weights); - return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4); -} - -size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { - UNUSED(quant_weights); - return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8); -} - -size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { - UNUSED(quant_weights); - return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8); -} diff --git a/ggml/src/ggml-aarch64.h b/ggml/src/ggml-aarch64.h deleted file mode 100644 index a57868591..000000000 --- a/ggml/src/ggml-aarch64.h +++ /dev/null @@ -1,19 +0,0 @@ -#pragma once - -#include "ggml.h" - -// GGML internal header - -#ifdef __cplusplus -extern "C" { -#endif - -// Quantization utilizing an importance matrix (a.k.a. 
"Activation aWare Quantization") -size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); - -#ifdef __cplusplus -} -#endif - diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 04e25b8ab..fa04ab84f 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2089,7 +2089,7 @@ static void * ggml_backend_cann_reg_get_proc_address(ggml_backend_reg_t reg, con static const ggml_backend_reg_i ggml_backend_cann_reg_interface = { /* .get_name = */ ggml_backend_cann_reg_get_name, /* .get_device_count = */ ggml_backend_cann_reg_get_device_count, - /* .get_device_get = */ ggml_backend_cann_reg_get_device, + /* .get_device = */ ggml_backend_cann_reg_get_device, /* .get_proc_address = */ ggml_backend_cann_reg_get_proc_address, }; diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 27253a6c2..7fd2aadec 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -6,7 +6,20 @@ typedef uint16_t ggml_half; typedef uint32_t ggml_half2; -#define GGML_COMMON_AGGR +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S + +#define GGML_COMMON_DECL +#elif defined(GGML_COMMON_DECL_CPP) +#include + +typedef uint16_t ggml_half; +typedef uint32_t ggml_half2; + +// std-c++ allow anonymous unions but some compiler warn on it +#define GGML_COMMON_AGGR_U data +// std-c++ do not allow it. +#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_METAL) @@ -15,7 +28,8 @@ typedef uint32_t ggml_half2; typedef half ggml_half; typedef half2 ggml_half2; -#define GGML_COMMON_AGGR +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_CUDA) @@ -29,7 +43,8 @@ typedef half2 ggml_half2; typedef half ggml_half; typedef half2 ggml_half2; -#define GGML_COMMON_AGGR data +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_HIP) @@ -39,7 +54,8 @@ typedef half2 ggml_half2; typedef half ggml_half; typedef half2 ggml_half2; -#define GGML_COMMON_AGGR data +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_SYCL) @@ -49,7 +65,8 @@ typedef half2 ggml_half2; typedef sycl::half ggml_half; typedef sycl::half2 ggml_half2; -#define GGML_COMMON_AGGR data +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #endif @@ -154,9 +171,9 @@ typedef struct { struct { ggml_half d; // delta ggml_half m; // min - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t qs[QK4_1 / 2]; // nibbles / quants } block_q4_1; static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding"); @@ -175,9 +192,9 @@ typedef struct { struct { ggml_half d; // delta ggml_half m; // min - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t qh[4]; // 5-th bit of quants uint8_t qs[QK5_1 / 2]; // nibbles / quants } block_q5_1; @@ -196,37 +213,13 @@ typedef struct { struct { ggml_half d; // delta ggml_half s; // d * sum(qs[i]) - } GGML_COMMON_AGGR; + } 
GGML_COMMON_AGGR_S; ggml_half2 ds; - }; + } GGML_COMMON_AGGR_U; int8_t qs[QK8_1]; // quants } block_q8_1; static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding"); -typedef struct { - ggml_half d[4]; // deltas for 4 q4_0 blocks - uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks -} block_q4_0x4; -static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_half) + QK4_0 * 2, "wrong q4_0x4 block size/padding"); - -typedef struct { - ggml_half d[8]; // deltas for 8 q4_0 blocks - uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks -} block_q4_0x8; -static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_half) + QK4_0 * 4, "wrong q4_0x8 block size/padding"); - -typedef struct { - ggml_half d[4]; // deltas for 4 q8_0 blocks - int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks -} block_q8_0x4; -static_assert(sizeof(block_q8_0x4) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong q8_0x4 block size/padding"); - -typedef struct { - ggml_half d[8]; // deltas for 8 q8_0 blocks - int8_t qs[QK8_0 * 8]; // quants for 8 q8_0 blocks -} block_q8_0x8; -static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding"); - // // Ternary quantization // @@ -261,9 +254,9 @@ typedef struct { struct { ggml_half d; // super-block scale for quantized scales ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; } block_q2_K; static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); @@ -288,9 +281,9 @@ typedef struct { struct { ggml_half d; // super-block scale for quantized scales ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits uint8_t qs[QK_K/2]; // 4--bit quants } block_q4_K; @@ -305,9 +298,9 @@ typedef struct { struct { ggml_half d; // super-block scale for quantized scales ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits uint8_t qh[QK_K/8]; // quants, high bit uint8_t qs[QK_K/2]; // quants, low 4 bits @@ -418,12 +411,6 @@ typedef struct { } block_iq4_xs; static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding"); -typedef struct { - ggml_half d[4]; // deltas for 4 iq4_nl blocks - uint8_t qs[QK4_NL * 2];// nibbles / quants for 4 iq4_nl blocks -} block_iq4_nlx4; -static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding"); - #endif // GGML_COMMON_DECL #endif // GGML_COMMON_DECL @@ -437,6 +424,13 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = { #define GGML_TABLE_END() }; +#define GGML_COMMON_IMPL +#elif defined(GGML_COMMON_IMPL_CPP) +#include + +#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = { +#define GGML_TABLE_END() }; + #define GGML_COMMON_IMPL #elif defined(GGML_COMMON_IMPL_METAL) #include diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index bc326c059..0e0556703 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -10,10 
+10,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name) list (APPEND GGML_CPU_SOURCES ggml-cpu/ggml-cpu.c ggml-cpu/ggml-cpu.cpp - ggml-cpu/ggml-cpu-aarch64.c + ggml-cpu/ggml-cpu-aarch64.cpp ggml-cpu/ggml-cpu-aarch64.h + ggml-cpu/ggml-cpu-hbm.cpp + ggml-cpu/ggml-cpu-hbm.h ggml-cpu/ggml-cpu-quants.c ggml-cpu/ggml-cpu-quants.h + ggml-cpu/ggml-cpu-traits.cpp + ggml-cpu/ggml-cpu-traits.h ggml-cpu/amx/amx.cpp ggml-cpu/amx/amx.h ggml-cpu/amx/mmq.cpp diff --git a/ggml/src/ggml-cpu/amx/amx.cpp b/ggml/src/ggml-cpu/amx/amx.cpp index 09c0df0f5..b9074cb3a 100644 --- a/ggml/src/ggml-cpu/amx/amx.cpp +++ b/ggml/src/ggml-cpu/amx/amx.cpp @@ -5,6 +5,7 @@ #include "ggml-backend.h" #include "ggml-impl.h" #include "ggml-cpu.h" +#include "ggml-cpu-traits.h" #if defined(__gnu_linux__) #include @@ -17,31 +18,65 @@ #if defined(__AMX_INT8__) && defined(__AVX512VNNI__) +// AMX type_trais +namespace ggml::cpu::amx { +class tensor_traits : public ggml::cpu::tensor_traits { + bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override { + size = ggml_backend_amx_desired_wsize(op); + return true; + } + + bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override { + if (op->op == GGML_OP_MUL_MAT) { + ggml_backend_amx_mul_mat(params, op); + return true; + } + return false; + } +}; + +static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struct ggml_tensor *) { + static tensor_traits traits; + return &traits; +} +} // namespace ggml::cpu::amx + // AMX buffer interface static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) { free(buffer->context); } static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) { - return (void *)(buffer->context); + return (void *) (buffer->context); } -static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { - memset((char *)tensor->data + offset, value, size); +static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { + tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor); GGML_UNUSED(buffer); } -static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { +static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, + uint8_t value, size_t offset, size_t size) { + memset((char *) tensor->data + offset, value, size); + + GGML_UNUSED(buffer); +} + +static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, + const void * data, size_t offset, size_t size) { if (qtype_has_amx_kernels(tensor->type)) { + GGML_LOG_DEBUG("%s: amx repack tensor %s of type %s\n", __func__, tensor->name, ggml_type_name(tensor->type)); ggml_backend_amx_convert_weight(tensor, data, offset, size); } else { - memcpy((char *)tensor->data + offset, data, size); + memcpy((char *) tensor->data + offset, data, size); } GGML_UNUSED(buffer); } +/* +// need to figure what we need to do with buffer->extra. 
static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { GGML_ASSERT(!qtype_has_amx_kernels(tensor->type)); memcpy(data, (const char *)tensor->data + offset, size); @@ -62,6 +97,7 @@ static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con GGML_UNUSED(buffer); } +*/ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { memset(buffer->context, value, buffer->size); @@ -70,13 +106,13 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = { /* .free_buffer = */ ggml_backend_amx_buffer_free_buffer, /* .get_base = */ ggml_backend_amx_buffer_get_base, - /* .init_tensor = */ NULL, // no initialization required + /* .init_tensor = */ ggml_backend_amx_buffer_init_tensor, /* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor, /* .set_tensor = */ ggml_backend_amx_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_amx_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_amx_buffer_cpy_tensor, + /* .get_tensor = */ nullptr, + /* .cpy_tensor = */ nullptr, /* .clear = */ ggml_backend_amx_buffer_clear, - /* .reset = */ NULL, + /* .reset = */ nullptr, }; static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) { @@ -101,18 +137,48 @@ static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_typ GGML_UNUSED(buft); } -static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) { +namespace ggml::cpu::amx { +class extra_buffer_type : ggml::cpu::extra_buffer_type { + bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override { + // handle only 2d gemm for now + auto is_contiguous_2d = [](const struct ggml_tensor * t) { + return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1; + }; + + if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) && // src0 must be contiguous + is_contiguous_2d(op->src[1]) && // src1 must be contiguous + op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() && + op->ne[0] % (TILE_N * 2) == 0 && // out_features is 32x + (qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) { + // src1 must be host buffer + if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { + return false; + } + // src1 must be float32 + if (op->src[1]->type == GGML_TYPE_F32) { + return true; + } + } + return false; + } + + ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override { + if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer && + op->src[0]->buffer->buft == ggml_backend_amx_buffer_type()) { + return (ggml::cpu::tensor_traits *) op->src[0]->extra; + } + + return nullptr; + } +}; +} // namespace ggml::cpu::amx + +static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { return ggml_backend_amx_get_alloc_size(tensor); GGML_UNUSED(buft); } -static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) { - return false; - - GGML_UNUSED(buft); -} - #define ARCH_GET_XCOMP_PERM 0x1022 #define ARCH_REQ_XCOMP_PERM 0x1023 #define XFEATURE_XTILECFG 17 @@ -129,68 +195,26 @@ static bool ggml_amx_init() { return true; #endif } + ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() { static struct ggml_backend_buffer_type 
ggml_backend_buffer_type_amx = { /* .iface = */ { - /* .get_name = */ ggml_backend_amx_buffer_type_get_name, - /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size, - /* .is_host = */ ggml_backend_amx_buffer_type_is_host, - }, + /* .get_name = */ ggml_backend_amx_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment, + /* .get_max_size = */ nullptr, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size, + /* .is_host = */ nullptr, + }, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), - /* .context = */ NULL, + /* .context = */ new ggml::cpu::amx::extra_buffer_type(), }; if (!ggml_amx_init()) { - return NULL; + return nullptr; } return &ggml_backend_buffer_type_amx; } -bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft) { - return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name; -} - -bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op) { - // handle only 2d gemm for now - auto is_contiguous_2d = [](const struct ggml_tensor * t) { - return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1; - }; - - switch (op->op) { - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - return true; - - case GGML_OP_MUL_MAT: { - const struct ggml_tensor * src0 = op->src[0]; - const struct ggml_tensor * src1 = op->src[1]; - - const enum ggml_type type = src0->type; - const int64_t ne0 = op->ne[0]; - - // amx kernels enables for Q4_0, Q4_1, Q8_0, F16 - // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256 - bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16); - - bool can_use_amx = - is_contiguous_2d(src0) && // src0 must be contiguous - is_contiguous_2d(src1) && // src1 must be contiguous - src1->type == GGML_TYPE_F32 && // src1 must be float32 - has_amx_kernels && // with amx kernel impls - ne0 % (TILE_N * 2) == 0; // out_features is 32x - - return can_use_amx; - } - default: - return false; - } -} - -#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__) +#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__) diff --git a/ggml/src/ggml-cpu/amx/amx.h b/ggml/src/ggml-cpu/amx/amx.h index c43546273..5b65d76bd 100644 --- a/ggml/src/ggml-cpu/amx/amx.h +++ b/ggml/src/ggml-cpu/amx/amx.h @@ -1,20 +1,8 @@ #include "ggml-backend.h" #include "ggml-cpu-impl.h" -#ifdef __cplusplus -extern "C" { -#endif +// GGML internal header #if defined(__AMX_INT8__) && defined(__AVX512VNNI__) - ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void); -bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft); -bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op); -void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst); -size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst); - -#endif - -#ifdef __cplusplus -} #endif diff --git a/ggml/src/ggml-cpu/amx/common.h b/ggml/src/ggml-cpu/amx/common.h index 40074c3fc..f392e8985 100644 --- a/ggml/src/ggml-cpu/amx/common.h +++ b/ggml/src/ggml-cpu/amx/common.h @@ -7,7 +7,7 @@ #include #include -#if defined(_OPENMP) +#if defined(GGML_USE_OPENMP) #include #endif @@ -56,11 +56,11 @@ inline void 
balance211(T n, T nth, T ith, T& n_start, T& n_end) { } template -inline void parallel_for(int nth, int n, const func_t& f) { -#if defined(_OPENMP) -#pragma omp parallel num_threads(nth) +inline void parallel_for(int n, const func_t& f) { +#if defined(GGML_USE_OPENMP) +#pragma omp parallel { - //int nth = omp_get_num_threads(); + int nth = omp_get_num_threads(); int ith = omp_get_thread_num(); int tbegin, tend; balance211(n, nth, ith, tbegin, tend); @@ -68,8 +68,6 @@ inline void parallel_for(int nth, int n, const func_t& f) { } #else f(0, n); - - GGML_UNUSED(nth); #endif } @@ -91,10 +89,3 @@ inline bool qtype_has_amx_kernels(const enum ggml_type type) { (type == GGML_TYPE_Q6_K) || (type == GGML_TYPE_IQ4_XS); } - -// ggml backend context -struct ggml_backend_amx_context { - int n_threads = GGML_DEFAULT_N_THREADS; - std::unique_ptr work_data; - size_t work_size = 0; -}; diff --git a/ggml/src/ggml-cpu/amx/mmq.cpp b/ggml/src/ggml-cpu/amx/mmq.cpp index 0ec3aa86d..0ea91596b 100644 --- a/ggml/src/ggml-cpu/amx/mmq.cpp +++ b/ggml/src/ggml-cpu/amx/mmq.cpp @@ -18,10 +18,6 @@ #include #endif -#if defined(_OPENMP) -#include -#endif - #if (defined(_WIN32) || defined(_WIN64)) #define RESTRICT __restrict #else @@ -1382,13 +1378,13 @@ struct tinygemm_kernel_avx #define PACKED_INDEX(n, k, KB, tile_size) (n * KB + k) * tile_size template -void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K, int n_threads) { +void convert_B_packed_format(void * RESTRICT packed_B, const TB * RESTRICT B, int N, int K) { const int NB = N / TILE_N; const int KB = K / BLOCK_K; const int TILE_SIZE = get_tile_size(); // parallel on NB should be enough - parallel_for(n_threads, NB, [&](int begin, int end) { + parallel_for(NB, [&](int begin, int end) { for (int n = begin; n < end; ++n) { for (int k = 0; k < KB; ++k) { int n0 = n * TILE_N; @@ -2334,15 +2330,8 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d const int K = tensor->ne[0]; // ne0: in_features const int N = tensor->ne[1]; // ne1: out_features -#if defined(_OPENMP) - // the buffer ctx is not initialized when .set_tensor is called - int n_threads = omp_get_num_threads(); -#else - int n_threads = 1; -#endif - GGML_DISPATCH_QTYPES(TYPE, [&] { - convert_B_packed_format((void *)((char *)tensor->data + offset), (const type *)data, N, K, n_threads); + convert_B_packed_format((void *)((char *)tensor->data + offset), (const type *)data, N, K); }); } diff --git a/ggml/src/ggml-cpu/amx/mmq.h b/ggml/src/ggml-cpu/amx/mmq.h index f37366093..baf768477 100644 --- a/ggml/src/ggml-cpu/amx/mmq.h +++ b/ggml/src/ggml-cpu/amx/mmq.h @@ -1,16 +1,10 @@ #pragma once #include "common.h" -#ifdef __cplusplus -extern "C" { -#endif +size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst); size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor); void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst); - -#ifdef __cplusplus -} -#endif diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp similarity index 85% rename from ggml/src/ggml-cpu/ggml-cpu-aarch64.c rename to ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp index 11152385e..9b9e3c92a 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp @@ -1,20 +1,57 @@ -#define GGML_COMMON_IMPL_C +#define GGML_COMMON_IMPL_CPP +#define 
GGML_COMMON_DECL_CPP #include "ggml-common.h" +#include "ggml-backend-impl.h" #include "ggml-quants.h" #include "ggml-impl.h" #include "ggml-cpu.h" -#include "ggml-cpu/ggml-cpu-impl.h" +#include "ggml-cpu-impl.h" +#include "ggml-cpu-traits.h" -#include -#include -#include -#include -#include // for qsort -#include // for GGML_ASSERT +#include +#include +#include +#include +#include // for qsort +#include // for GGML_ASSERT #include "ggml-cpu-aarch64.h" +// TODO: move to include file? +template constexpr int QK_0() { + if constexpr (K == 4) { + return QK4_0; + } + if constexpr (K == 8) { + return QK8_0; + } + return -1; +} + +template struct block { + ggml_half d[N]; // deltas for N qK_0 blocks + int8_t qs[(QK_0() * N * K) / 8]; // quants for N qK_0 blocks +}; + +// control size +static_assert(sizeof(block<4, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding"); +static_assert(sizeof(block<4, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding"); +static_assert(sizeof(block<8, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding"); +static_assert(sizeof(block<8, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding"); + +using block_q4_0x4 = block<4, 4>; +using block_q4_0x8 = block<4, 8>; +using block_q8_0x4 = block<8, 4>; +using block_q8_0x8 = block<8, 8>; + +struct block_iq4_nlx4 { + ggml_half d[4]; // deltas for 4 iq4_nl blocks + uint8_t qs[QK4_NL * 2]; // nibbles / quants for 4 iq4_nl blocks +}; + +static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding"); + #if defined(__GNUC__) #pragma GCC diagnostic ignored "-Woverlength-strings" #elif defined(_MSC_VER) @@ -185,12 +222,12 @@ static inline __m256i mul_sum_i8_pairs_int32x8(const __m256i x, const __m256i y) static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; -static void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k) { +static void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); const int nb = k / QK8_0; - block_q8_0x4 * restrict y = (block_q8_0x4 *) vy; + block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy; #if defined(__ARM_NEON) float32x4_t srcv[4][8]; @@ -279,12 +316,12 @@ static void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int6 #endif } -static void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int64_t k) { +static void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); const int nb = k / QK8_0; - block_q8_0x4 * restrict y = (block_q8_0x4 *) vy; + block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy; #if defined(__ARM_NEON) float32x4_t srcv[4][8]; @@ -494,7 +531,7 @@ static void quantize_q8_0_4x8(const float * restrict x, void * restrict vy, int6 #endif } -void quantize_mat_q8_0(const float * restrict x, void * restrict vy, int64_t nrow, int64_t n_per_row, int64_t blck_size_interleave) { +static void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row, int64_t blck_size_interleave) { assert(nrow == 4); UNUSED(nrow); if (blck_size_interleave == 4) { @@ -506,7 +543,7 @@ void quantize_mat_q8_0(const float * restrict x, void * restrict vy, int64_t nro } } -void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const 
void * restrict vy, int nr, int nc) { +static void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; const int ncols_interleaved = 4; @@ -591,7 +628,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * } } -void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { +static void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; const int ncols_interleaved = 4; @@ -701,7 +738,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * } } -void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { +static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; const int ncols_interleaved = 8; @@ -974,7 +1011,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * } } -void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { +static void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; const int ncols_interleaved = 4; @@ -1070,7 +1107,7 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void } } -void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { +static void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; const int ncols_interleaved = 4; @@ -1586,7 +1623,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * } } -void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { +static void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; const int ncols_interleaved = 4; @@ -2040,7 +2077,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void * } } -void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { +static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; const int ncols_interleaved = 8; @@ -2560,31 +2597,31 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31) // Shuffle pattern one - right side input - 
const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3) - const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3) + const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, (_MM_PERM_ENUM)136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3) + const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, (_MM_PERM_ENUM)136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3) - const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11) - const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11) + const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, (_MM_PERM_ENUM)136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11) + const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, (_MM_PERM_ENUM)136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11) - const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19) - const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19) + const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, (_MM_PERM_ENUM)136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19) + const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, (_MM_PERM_ENUM)136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19) - const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27) - const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27) + const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, (_MM_PERM_ENUM)136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) 
B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27) + const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, (_MM_PERM_ENUM)136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27) // Shuffle pattern two - right side input - const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7) - const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7) + const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, (_MM_PERM_ENUM)221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7) + const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, (_MM_PERM_ENUM)221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7) - const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15) - const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15) + const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, (_MM_PERM_ENUM)221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15) + const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, (_MM_PERM_ENUM)221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15) - const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23) - const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23) + const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, (_MM_PERM_ENUM)221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23) + const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, (_MM_PERM_ENUM)221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23) - const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) 
B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31) - const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31) + const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, (_MM_PERM_ENUM)221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31) + const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, (_MM_PERM_ENUM)221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31) // Scale values - Load the weight scale values of two block_q4_0x8 const __m512 col_scale_f32 = GGML_F32Cx8x2_LOAD(b_ptr_0[b].d, b_ptr_1[b].d); @@ -2618,31 +2655,31 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * // Shuffle pattern one - left side input - const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, 160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) - const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, 160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) + const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, (_MM_PERM_ENUM)160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) + const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, (_MM_PERM_ENUM)160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) - const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, 160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) - const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, 160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) + const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, (_MM_PERM_ENUM)160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) + const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, (_MM_PERM_ENUM)160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) - const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, 160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) - const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, 160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) + const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, (_MM_PERM_ENUM)160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) 
A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) + const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, (_MM_PERM_ENUM)160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) - const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, 160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) - const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, 160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) + const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, (_MM_PERM_ENUM)160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) + const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, (_MM_PERM_ENUM)160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) // Shuffle pattern two - left side input - const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, 245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) - const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, 245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) + const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, (_MM_PERM_ENUM)245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) + const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, (_MM_PERM_ENUM)245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) - const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, 245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) - const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, 245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) + const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, (_MM_PERM_ENUM)245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) + const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, (_MM_PERM_ENUM)245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) - const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, 245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) - const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, 245); //A2(20-23) A2(20-23) 
A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) + const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, (_MM_PERM_ENUM)245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) + const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, (_MM_PERM_ENUM)245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) - const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, 245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) - const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, 245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) + const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, (_MM_PERM_ENUM)245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) + const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, (_MM_PERM_ENUM)245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane // Resembles MMLAs into 2x2 matrices in ARM Version @@ -2671,10 +2708,10 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * // Straighten out to make 4 row vectors - __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, 78)); - __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01); - __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, 78)); - __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11); + __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, (_MM_PERM_ENUM)78)); + __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, (_MM_PERM_ENUM)78), iacc_mat_01); + __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, (_MM_PERM_ENUM)78)); + __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, (_MM_PERM_ENUM)78), iacc_mat_11); // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptrs[rp][b].d), loadMask), 68); @@ -2753,31 +2790,31 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * const __m512i rhs_mat_2367ABEF_3 = _mm512_shuffle_epi8(signextendlutexpanded, _mm512_and_si512(_mm512_srli_epi16(rhs_raw_mat_2367ABEF_1, 4), m4bexpanded)); //B2(24-31) B3(24-31) B6(24-31) B7(24-31) BA(24-31) BB(24-31) BE(24-31) BF(24-31) // Shuffle pattern one - right side input - const __m512i 
rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3) - const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3) + const __m512i rhs_mat_014589CD_0_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, (_MM_PERM_ENUM)136); //B0(0-3) B1(0-3) B0(0-3) B1(0-3) B4(0-3) B5(0-3) B4(0-3) B5(0-3) B8(0-3) B9(0-3) B8(0-3) B9(0-3) BC(0-3) BD(0-3) BC(0-3) BD(0-3) + const __m512i rhs_mat_2367ABEF_0_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, (_MM_PERM_ENUM)136); //B2(0-3) B3(0-3) B2(0-3) B3(0-3) B6(0-3) B7(0-3) B6(0-3) B7(0-3) BA(0-3) BB(0-3) BA(0-3) BB(0-3) BE(0-3) BF(0-3) BE(0-3) BF(0-3) - const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11) - const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11) + const __m512i rhs_mat_014589CD_1_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, (_MM_PERM_ENUM)136); //B0(8-11) B1(8-11) B0(8-11) B1(8-11) B4(8-11) B5(8-11) B4(8-11) B5(8-11) B8(8-11) B9(8-11) B8(8-11) B9(8-11) BC(8-11) BD(8-11) BC(8-11) BD(8-11) + const __m512i rhs_mat_2367ABEF_1_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, (_MM_PERM_ENUM)136); //B2(8-11) B3(8-11) B2(8-11) B3(8-11) B6(8-11) B7(8-11) B6(8-11) B7(8-11) BA(8-11) BB(8-11) BA(8-11) BB(8-11) BE(8-11) BF(8-11) BE(8-11) BF(8-11) - const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19) - const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19) + const __m512i rhs_mat_014589CD_2_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, (_MM_PERM_ENUM)136); //B0(16-19) B1(16-19) B0(16-19) B1(16-19) B4(16-19) B5(16-19) B4(16-19) B5(16-19) B8(16-19) B9(16-19) B8(16-19) B9(16-19) BC(16-19) BD(16-19) BC(16-19) BD(16-19) + const __m512i rhs_mat_2367ABEF_2_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, (_MM_PERM_ENUM)136); //B2(16-19) B3(16-19) B2(16-19) B3(16-19) B6(16-19) B7(16-19) B6(16-19) B7(16-19) BA(16-19) BB(16-19) BA(16-19) BB(16-19) BE(16-19) BF(16-19) BE(16-19) BF(16-19) - const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27) - const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27) + const __m512i rhs_mat_014589CD_3_sp1 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, (_MM_PERM_ENUM)136); //B0(24-27) B1(24-27) B0(24-27) B1(24-27) B4(24-27) B5(24-27) 
B4(24-27) B5(24-27) B8(24-27) B9(24-27) B8(24-27) B9(24-27) BC(24-27) BD(24-27) BC(24-27) BD(24-27) + const __m512i rhs_mat_2367ABEF_3_sp1 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, (_MM_PERM_ENUM)136); //B2(24-27) B3(24-27) B2(24-27) B3(24-27) B6(24-27) B7(24-27) B6(24-27) B7(24-27) BA(24-27) BB(24-27) BA(24-27) BB(24-27) BE(24-27) BF(24-27) BE(24-27) BF(24-27) // Shuffle pattern two - right side input - const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, 221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7) - const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, 221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7) + const __m512i rhs_mat_014589CD_0_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_0, (_MM_PERM_ENUM)221); //B0(4-7) B1(4-7) B0(4-7) B1(4-7) B4(4-7) B5(4-7) B4(4-7) B5(4-7) B8(4-7) B9(4-7) B8(4-7) B9(4-7) BC(4-7) BD(4-7) BC(4-7) BD(4-7) + const __m512i rhs_mat_2367ABEF_0_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_0, (_MM_PERM_ENUM)221); //B2(4-7) B3(4-7) B2(4-7) B3(4-7) B6(4-7) B7(4-7) B6(4-7) B7(4-7) BA(4-7) BB(4-7) BA(4-7) BB(4-7) BE(4-7) BF(4-7) BE(4-7) BF(4-7) - const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, 221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15) - const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, 221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15) + const __m512i rhs_mat_014589CD_1_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_1, (_MM_PERM_ENUM)221); //B0(12-15) B1(12-15) B0(12-15) B1(12-15) B4(12-15) B5(12-15) B4(12-15) B5(12-15) B8(12-15) B9(12-15) B8(12-15) B9(12-15) BC(12-15) BD(12-15) BC(12-15) BD(12-15) + const __m512i rhs_mat_2367ABEF_1_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_1, (_MM_PERM_ENUM)221); //B2(12-15) B3(12-15) B2(12-15) B3(12-15) B6(12-15) B7(12-15) B6(12-15) B7(12-15) BA(12-15) BB(12-15) BA(12-15) BB(12-15) BE(12-15) BF(12-15) BE(12-15) BF(12-15) - const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, 221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23) - const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, 221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23) + const __m512i rhs_mat_014589CD_2_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_2, (_MM_PERM_ENUM)221); //B0(20-23) B1(20-23) B0(20-23) B1(20-23) B4(20-23) B5(20-23) B4(20-23) B5(20-23) B8(20-23) B9(20-23) B8(20-23) B9(20-23) BC(20-23) BD(20-23) BC(20-23) BD(20-23) + const __m512i rhs_mat_2367ABEF_2_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_2, (_MM_PERM_ENUM)221); //B2(20-23) B3(20-23) B2(20-23) B3(20-23) B6(20-23) B7(20-23) B6(20-23) B7(20-23) BA(20-23) BB(20-23) BA(20-23) BB(20-23) BE(20-23) BF(20-23) BE(20-23) BF(20-23) - const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, 221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) 
B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31) - const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, 221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31) + const __m512i rhs_mat_014589CD_3_sp2 = _mm512_shuffle_epi32(rhs_mat_014589CD_3, (_MM_PERM_ENUM)221); //B0(28-31) B1(28-31) B0(28-31) B1(28-31) B4(28-31) B5(28-31) B4(28-31) B5(28-31) B8(28-31) B9(28-31) B8(28-31) B9(28-31) BC(28-31) BD(28-31) BC(28-31) BD(28-31) + const __m512i rhs_mat_2367ABEF_3_sp2 = _mm512_shuffle_epi32(rhs_mat_2367ABEF_3, (_MM_PERM_ENUM)221); //B2(28-31) B3(28-31) B2(28-31) B3(28-31) B6(28-31) B7(28-31) B6(28-31) B7(28-31) BA(28-31) BB(28-31) BA(28-31) BB(28-31) BE(28-31) BF(28-31) BE(28-31) BF(28-31) // Scale values - Load the weight scale values of two block_q4_0x8 @@ -2809,31 +2846,31 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * // Shuffle pattern one - left side input - const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, 160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) - const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, 160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) + const __m512i lhs_mat_01_0_sp1 = _mm512_shuffle_epi32(lhs_mat_01_0, (_MM_PERM_ENUM)160); //A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) A0(0-3) A0(0-3) A1(0-3) A1(0-3) + const __m512i lhs_mat_23_0_sp1 = _mm512_shuffle_epi32(lhs_mat_23_0, (_MM_PERM_ENUM)160); //A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) A2(0-3) A2(0-3) A3(0-3) A3(0-3) - const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, 160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) - const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, 160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) + const __m512i lhs_mat_01_1_sp1 = _mm512_shuffle_epi32(lhs_mat_01_1, (_MM_PERM_ENUM)160); //A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) A0(8-11) A0(8-11) A1(8-11) A1(8-11) + const __m512i lhs_mat_23_1_sp1 = _mm512_shuffle_epi32(lhs_mat_23_1, (_MM_PERM_ENUM)160); //A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) A2(8-11) A2(8-11) A3(8-11) A3(8-11) - const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, 160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) - const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, 160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) + const __m512i lhs_mat_01_2_sp1 = _mm512_shuffle_epi32(lhs_mat_01_2, (_MM_PERM_ENUM)160); //A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) A1(16-19) A0(16-19) A0(16-19) A1(16-19) 
A1(16-19) + const __m512i lhs_mat_23_2_sp1 = _mm512_shuffle_epi32(lhs_mat_23_2, (_MM_PERM_ENUM)160); //A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) A2(16-19) A2(16-19) A3(16-19) A3(16-19) - const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, 160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) - const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, 160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) + const __m512i lhs_mat_01_3_sp1 = _mm512_shuffle_epi32(lhs_mat_01_3, (_MM_PERM_ENUM)160); //A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) A0(24-27) A0(24-27) A1(24-27) A1(24-27) + const __m512i lhs_mat_23_3_sp1 = _mm512_shuffle_epi32(lhs_mat_23_3, (_MM_PERM_ENUM)160); //A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) A2(24-27) A2(24-27) A3(24-27) A3(24-27) // Shuffle pattern two - left side input - const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, 245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) - const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, 245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) + const __m512i lhs_mat_01_0_sp2 = _mm512_shuffle_epi32(lhs_mat_01_0, (_MM_PERM_ENUM)245); //A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) A0(4-7) A0(4-7) A1(4-7) A1(4-7) + const __m512i lhs_mat_23_0_sp2 = _mm512_shuffle_epi32(lhs_mat_23_0, (_MM_PERM_ENUM)245); //A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) A2(4-7) A2(4-7) A3(4-7) A3(4-7) - const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, 245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) - const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, 245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) + const __m512i lhs_mat_01_1_sp2 = _mm512_shuffle_epi32(lhs_mat_01_1, (_MM_PERM_ENUM)245); //A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) A0(12-15) A0(12-15) A1(12-15) A1(12-15) + const __m512i lhs_mat_23_1_sp2 = _mm512_shuffle_epi32(lhs_mat_23_1, (_MM_PERM_ENUM)245); //A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) A2(12-15) A2(12-15) A3(12-15) A3(12-15) - const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, 245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) - const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, 245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) 
A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) + const __m512i lhs_mat_01_2_sp2 = _mm512_shuffle_epi32(lhs_mat_01_2, (_MM_PERM_ENUM)245); //A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) A0(20-23) A0(20-23) A1(20-23) A1(20-23) + const __m512i lhs_mat_23_2_sp2 = _mm512_shuffle_epi32(lhs_mat_23_2, (_MM_PERM_ENUM)245); //A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) A2(20-23) A2(20-23) A3(20-23) A3(20-23) - const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, 245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) - const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, 245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) + const __m512i lhs_mat_01_3_sp2 = _mm512_shuffle_epi32(lhs_mat_01_3, (_MM_PERM_ENUM)245); //A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) A0(28-31) A0(28-31) A1(28-31) A1(28-31) + const __m512i lhs_mat_23_3_sp2 = _mm512_shuffle_epi32(lhs_mat_23_3, (_MM_PERM_ENUM)245); //A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) A2(28-31) A2(28-31) A3(28-31) A3(28-31) // The values arranged in shuffle patterns are operated with dot product operation within 32 bit lane i.e corresponding bytes and multiplied and added into 32 bit integers within 32 bit lane // Resembles MMLAs into 2x2 matrices in ARM Version @@ -2862,10 +2899,10 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * // Straighten out to make 4 row vectors - __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, 78)); - __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, 78), iacc_mat_01); - __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, 78)); - __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, 78), iacc_mat_11); + __m512i iacc_row_0 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_00, _mm512_shuffle_epi32(iacc_mat_01, (_MM_PERM_ENUM)78)); + __m512i iacc_row_1 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_00, (_MM_PERM_ENUM)78), iacc_mat_01); + __m512i iacc_row_2 = _mm512_mask_blend_epi32(0xCCCC, iacc_mat_10, _mm512_shuffle_epi32(iacc_mat_11, (_MM_PERM_ENUM)78)); + __m512i iacc_row_3 = _mm512_mask_blend_epi32(0xCCCC, _mm512_shuffle_epi32(iacc_mat_10, (_MM_PERM_ENUM)78), iacc_mat_11); // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptr[b].d), loadMask), 68); @@ -3460,7 +3497,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void * } } -void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) { +static void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { const int qk = QK8_0; const int nb = n / qk; const int ncols_interleaved = 4; @@ -3571,7 +3608,6 @@ void 
ggml_gemm_iq4_nl_4x4_q8_0(int n, float * restrict s, size_t bs, const void } } -// FIXME: this code is duplicated from ggml-aarch64.c static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) { block_q4_0x4 out; @@ -3641,20 +3677,20 @@ static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_in return out; } -static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) { +static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { GGML_ASSERT(t->type == GGML_TYPE_Q4_0); GGML_ASSERT(interleave_block == 4 || interleave_block == 8); + constexpr int nrows_interleaved = 4; block_q4_0x4 * dst = (block_q4_0x4 *)t->data; const block_q4_0 * src = (const block_q4_0 *)data; block_q4_0 dst_tmp[4]; - int nrow = t->ne[1]; // Number of rows - int nrows_interleaved = 4; + int nrow = ggml_nrows(t); int nblocks = t->ne[0] / QK4_0; GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0)); - if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { + if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { return -1; } @@ -3672,20 +3708,20 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block GGML_UNUSED(data_size); } -static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, const void * restrict data, size_t data_size) { +static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { GGML_ASSERT(t->type == GGML_TYPE_Q4_0); GGML_ASSERT(interleave_block == 8); + constexpr int nrows_interleaved = 8; block_q4_0x8 * dst = (block_q4_0x8*)t->data; const block_q4_0 * src = (const block_q4_0*) data; block_q4_0 dst_tmp[8]; - int nrow = t->ne[1]; // Number of rows - int nrows_interleaved = 8; + int nrow = ggml_nrows(t); int nblocks = t->ne[0] / QK4_0; GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0)); - if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { + if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { return -1; } @@ -3736,20 +3772,20 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s return out; } -static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) { +static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL); GGML_ASSERT(interleave_block == 4 || interleave_block == 8); block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data; const block_iq4_nl * src = (const block_iq4_nl *)data; block_iq4_nl dst_tmp[4]; - int nrow = t->ne[1]; // Number of rows + int nrow = ggml_nrows(t); int nrows_interleaved = 4; int nblocks = t->ne[0] / QK4_0; GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl)); - if (nrow % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { + if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { return -1; } @@ -3767,57 +3803,456 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b GGML_UNUSED(data_size); } -// Prepare for optimized kernels if applicable -void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * restrict data, size_t data_size) { - if (cur->type == repack_type) { - memcpy(cur->data, data, data_size); - return; - } +namespace 
ggml::cpu::aarch64 { +// repack +template +int repack(struct ggml_tensor *, const void *, size_t); - if (cur->type == GGML_TYPE_Q4_0) { - switch (repack_type) { - case GGML_TYPE_Q4_0_8_8: - repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size); - break; - case GGML_TYPE_Q4_0_4_8: - repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size); - break; - case GGML_TYPE_Q4_0_4_4: - repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size); - break; - default: - GGML_ABORT("Unsupported type"); - } - } else if (cur->type == GGML_TYPE_IQ4_NL) { - switch (repack_type) { - case GGML_TYPE_IQ4_NL_4_4: - repack_iq4_nl_to_iq4_nl_4_bl(cur, 4, data, data_size); - break; - default: - GGML_ABORT("Unsupported type"); - } - } else { - GGML_ABORT("Unsupported type"); - } +// TODO: generalise. +template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size); } -enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) { +template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size); +} + +template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size); +} + +template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size); +} + +template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { + return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size); +} + +// gemv +template +void gemv(int, float *, size_t, const void *, const void *, int, int); + +template <> void gemv(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc); +} + +template <> void gemv(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc); +} + +template <> void gemv(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc); +} + +template <> +void gemv(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc); +} + +// gemm +template +void gemm(int, float *, size_t, const void *, const void *, int, int); + +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc); +} + +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc); +} + +template <> void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc); +} + +template <> +void gemm(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { + ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc); +} + +class tensor_traits_base : public ggml::cpu::tensor_traits { + public: + virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0; +}; + +template class tensor_traits : public tensor_traits_base { + + bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override { + // not realy a GGML_TYPE_Q8_0 but same size. 
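+        // The repacked mul_mat path quantizes src1 from F32 to Q8_0 into the work buffer
+        // before the gemm/gemv kernels run, so the scratch size is the Q8_0 footprint of the
+        // whole src1 tensor. Rough worked example (illustrative shapes, not from this patch):
+        // a Q8_0 block packs 32 values into 34 bytes (fp16 scale + 32 int8 quants), so an
+        // src1 of 4096 x 32 = 131072 floats needs 131072 / 32 * 34 = 139264 bytes here.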
+ switch (op->op) { + case GGML_OP_MUL_MAT: + size = ggml_row_size(GGML_TYPE_Q8_0, ggml_nelements(op->src[1])); + return true; + case GGML_OP_MUL_MAT_ID: + size = ggml_row_size(GGML_TYPE_Q8_0, ggml_nelements(op->src[1])); + size = GGML_PAD(size, sizeof(int64_t)); // + padding for next bloc. + size += sizeof(int64_t) * (1+op->src[0]->ne[2]) * op->src[1]->ne[2]; + return true; + default: + // GGML_ABORT("fatal error"); + break; + } + return false; + } + + bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override { + switch (op->op) { + case GGML_OP_MUL_MAT: + forward_mul_mat(params, op); + return true; + case GGML_OP_MUL_MAT_ID: + forward_mul_mat_id(params, op); + return true; + default: + // GGML_ABORT("fatal error"); + break; + } + return false; + } + + void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) { + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + GGML_ASSERT(ggml_n_dims(op->src[0]) == 2); + // GGML_ASSERT(ggml_n_dims(op->src[1]) == 2); + + char * wdata = static_cast(params->wdata); + const size_t nbw1 = ggml_row_size(GGML_TYPE_Q8_0, ne10); + + assert(params->wsize >= nbw1 * ne11); + + const ggml_from_float_t from_float = ggml_get_type_traits_cpu(GGML_TYPE_Q8_0)->from_float; + + int64_t i11_processed = 0; + for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) { + quantize_mat_q8_0((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10, + INTER_SIZE); + } + i11_processed = ne11 - ne11 % 4; + for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) { + from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10); + } + + ggml_barrier(params->threadpool); + + const void * src1_wdata = params->wdata; + const size_t src1_col_stride = ggml_row_size(GGML_TYPE_Q8_0, ne10); + int64_t src0_start = (ith * ne01) / nth; + int64_t src0_end = ((ith + 1) * ne01) / nth; + src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start; + src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end; + if (src0_start >= src0_end) { + return; + } + + // If there are more than three rows in src1, use gemm; otherwise, use gemv. 
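+        // src1 rows were quantized in groups of 4 by quantize_mat_q8_0 above, so the gemm
+        // call below covers the largest multiple of 4 rows (ne11 - ne11 % 4) in one pass and
+        // the remaining 0..3 rows fall through to per-row gemv calls. src0_start/src0_end
+        // were rounded up to multiples of NB_COLS so each thread handles whole interleaved
+        // column groups of the repacked src0.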
+ if (ne11 > 3) { + gemm(ne00, (float *) ((char *) dst->data) + src0_start, ne01, + (const char *) src0->data + src0_start * nb01, + (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start); + } + for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) { + gemv(ne00, (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01, + (const char *) src0->data + src0_start * nb01, + (const char *) src1_wdata + (src1_col_stride * iter), 1, + src0_end - src0_start); + } + } + + void forward_mul_mat_id(ggml_compute_params * params, ggml_tensor * op) { + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + const ggml_tensor * ids = op->src[2]; + ggml_tensor * dst = op; + + GGML_TENSOR_BINARY_OP_LOCALS + + const int ith = params->ith; + const int nth = params->nth; + + const ggml_from_float_t from_float = ggml_get_type_traits_cpu(GGML_TYPE_Q8_0)->from_float; + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(src0->type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ne03 == 1); + GGML_ASSERT(ne13 == 1); + GGML_ASSERT(ne3 == 1); + + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + // row groups + const int n_ids = ids->ne[0]; // n_expert_used + const int n_as = ne02; // n_expert + + const size_t nbw1 = ggml_row_size(GGML_TYPE_Q8_0, ne10); + const size_t nbw2 = nbw1*ne11; + const size_t nbw3 = nbw2*ne12; + + struct mmid_row_mapping { + int32_t i1; + int32_t i2; + }; + + GGML_ASSERT(params->wsize >= (GGML_PAD(nbw3, sizeof(int64_t)) + n_as * sizeof(int64_t) + + n_as * ne12 * sizeof(mmid_row_mapping))); + + auto wdata = (char *) params->wdata; + auto wdata_src1_end = (char *) wdata + GGML_PAD(nbw3, sizeof(int64_t)); + int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as] + struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12] + + // src1: float32 => block_q8_0 + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = ith; i11 < ne11; i11 += nth) { + from_float((float *)((char *) src1->data + i12 * nb12 + i11 * nb11), + (void *) (wdata + i12 * nbw2 + i11 * nbw1), + ne10); + } + } + +#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ne12 + (i1)] + + if (ith == 0) { + // initialize matrix_row_counts + memset(matrix_row_counts, 0, n_as * sizeof(int64_t)); + + // group rows by src0 matrix + for (int32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) { + for (int32_t id = 0; id < n_ids; ++id) { + const int32_t i02 = + *(const int32_t *) ((const char *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]); + + GGML_ASSERT(i02 >= 0 && i02 < n_as); + + MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = { id, iid1 }; + matrix_row_counts[i02] += 1; + } + } + } + + ggml_barrier(params->threadpool); + + // compute each matrix multiplication in sequence + for (int cur_a = 0; cur_a < n_as; ++cur_a) { + const int64_t cne1 = matrix_row_counts[cur_a]; + + if (cne1 == 0) { + continue; + } + + auto src0_cur = (const char *) src0->data + cur_a*nb02; + + //const int64_t nr0 = ne01; // src0 rows + const int64_t nr1 = cne1; // src1 rows + + int64_t src0_cur_start = (ith * ne01) / nth; + int64_t src0_cur_end = ((ith + 1) * ne01) / nth; + src0_cur_start = + (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start; + src0_cur_end = (src0_cur_end % NB_COLS) ? 
src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end; + + if (src0_cur_start >= src0_cur_end) return; + + for (int ir1 = 0; ir1 < nr1; ir1++) { + struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1); + const int id = row_mapping.i1; // selected expert index + + const int64_t i11 = id % ne11; + const int64_t i12 = row_mapping.i2; // row index in src1 + + const int64_t i1 = id; // selected expert index + const int64_t i2 = i12; // row + + auto src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2); + + gemv( + ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, + ne01, src0_cur + src0_cur_start * nb01, + src1_col, 1, src0_cur_end - src0_cur_start); + } + } +#undef MMID_MATRIX_ROW + } + + int repack(struct ggml_tensor * t, const void * data, size_t data_size) override { + GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type), + (int) NB_COLS, (int) INTER_SIZE); + return ggml::cpu::aarch64::repack(t, data, data_size); + } +}; + +// instance for Q4 +static const tensor_traits q4_0_4x4_q8_0; +static const tensor_traits q4_0_4x8_q8_0; +static const tensor_traits q4_0_8x8_q8_0; + +// instance for IQ4 +static const tensor_traits iq4_nl_4x4_q8_0; + +} // namespace ggml::cpu::aarch64 + +static const ggml::cpu::tensor_traits * ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) { if (cur->type == GGML_TYPE_Q4_0) { - // TODO: enable for AVX2 - currently disabled due to bad gemv performance - if (/* ggml_cpu_has_avx2() || */ (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) { - return GGML_TYPE_Q4_0_8_8; + if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) { + if (cur->ne[1] % 8 == 0) { + return &ggml::cpu::aarch64::q4_0_8x8_q8_0; + } } if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { - return GGML_TYPE_Q4_0_4_8; + if (cur->ne[1] % 4 == 0) { + return &ggml::cpu::aarch64::q4_0_4x8_q8_0; + } } if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { - return GGML_TYPE_Q4_0_4_4; + if (cur->ne[1] % 4 == 0) { + return &ggml::cpu::aarch64::q4_0_4x4_q8_0; + } } } else if (cur->type == GGML_TYPE_IQ4_NL) { if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { - return GGML_TYPE_IQ4_NL_4_4; + if (cur->ne[1] % 4 == 0) { + return &ggml::cpu::aarch64::iq4_nl_4x4_q8_0; + } } } - return cur->type; + return nullptr; +} + +static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { + tensor->extra = (void *) const_cast(ggml_aarch64_get_optimal_repack_type(tensor)); + + GGML_UNUSED(buffer); +} + +static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, + const void * data, size_t offset, size_t size) { + GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); + + auto tensor_traits = (ggml::cpu::aarch64::tensor_traits_base *) tensor->extra; + auto OK = tensor_traits->repack(tensor, data, size); + + GGML_ASSERT(OK == 0); + GGML_UNUSED(buffer); +} + +static const char * ggml_backend_cpu_aarch64_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "CPU_AARCH64"; + + GGML_UNUSED(buft); +} + +static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); + + if (buffer == nullptr) { + return nullptr; + } + + 
buffer->buft = buft; + buffer->iface.init_tensor = ggml_backend_cpu_aarch64_buffer_init_tensor; + buffer->iface.set_tensor = ggml_backend_cpu_aarch64_buffer_set_tensor; + return buffer; +} + +static size_t ggml_backend_cpu_aarch64_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + return TENSOR_ALIGNMENT; + + GGML_UNUSED(buft); +} + +namespace ggml::cpu::aarch64 { +class extra_buffer_type : ggml::cpu::extra_buffer_type { + bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override { + if ( op->op == GGML_OP_MUL_MAT && + op->src[0]->buffer && + (ggml_n_dims(op->src[0]) == 2) && + op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type() && + ggml_aarch64_get_optimal_repack_type(op->src[0]) + ) { + if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { + return false; + } + if (op->src[1]->type == GGML_TYPE_F32) { + return true; + } + //if (op->src[1]->type == GGML_TYPE_Q8_0) { + // return true; + //} + // may be possible if Q8_0 packed... + } else if (op->op == GGML_OP_MUL_MAT_ID + && op->src[0]->buffer + && (ggml_n_dims(op->src[0]) == 3) + && op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type() + && ggml_aarch64_get_optimal_repack_type(op->src[0]) + ) { + if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { + return false; + } + if (op->src[1]->type == GGML_TYPE_F32) { + return true; + } + //if (op->src[1]->type == GGML_TYPE_Q8_0) { + // return true; + //} + } + return false; + } + + ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override { + if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) { + if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type()) { + return (ggml::cpu::tensor_traits *) op->src[0]->extra; + } + } + return nullptr; + } +}; +} // namespace ggml::cpu::aarch64 + +ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) { + static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_aarch64 = { + /* .iface = */ { + /* .get_name = */ ggml_backend_cpu_aarch64_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_cpu_aarch64_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_aarch64_buffer_type_get_alignment, + /* .get_max_size = */ nullptr, // defaults to SIZE_MAX + /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes + /* .is_host = */ nullptr, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), + /* .context = */ new ggml::cpu::aarch64::extra_buffer_type(), + }; + + return &ggml_backend_cpu_buffer_type_aarch64; } diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h index 3d9db6a19..6e84c826b 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.h @@ -1,32 +1,8 @@ #pragma once +#include "ggml-cpu-traits.h" #include "ggml.h" // GGML internal header -#ifdef __cplusplus -extern "C" { -#endif - -// Quantization -void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave); - -// GEMV -void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * 
GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); - -// GEMM -void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); -void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc); - -void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * data, size_t data_size); -enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur); - -#ifdef __cplusplus -} -#endif - +ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void); diff --git a/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp b/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp new file mode 100644 index 000000000..fa8dea2af --- /dev/null +++ b/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp @@ -0,0 +1,55 @@ +#ifdef GGML_USE_CPU_HBM + +#include "ggml-backend.h" +#include "ggml-backend-impl.h" +#include "ggml-cpu.h" +#include "ggml-impl.h" + +#include "ggml-cpu-hbm.h" + +// buffer type HBM + +#include + +static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "CPU_HBM"; + + GGML_UNUSED(buft); +} + +static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { + hbw_free(buffer->context); +} + +static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, + size_t size) { + void * ptr; + int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size); + if (result != 0) { + GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size); + return NULL; + } + + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer; + + return buffer; +} + +ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { + static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = { + /* .iface = */ { + /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, + /* .get_max_size = */ nullptr, // defaults to SIZE_MAX + /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, + }, + /* .context = */ nullptr, + }; + + return &ggml_backend_cpu_buffer_type_hbm; +} +#endif diff --git a/ggml/src/ggml-cpu/ggml-cpu-hbm.h b/ggml/src/ggml-cpu/ggml-cpu-hbm.h new file mode 100644 index 000000000..09a1f09d7 --- /dev/null +++ b/ggml/src/ggml-cpu/ggml-cpu-hbm.h @@ -0,0 +1,8 @@ +#pragma once + +#include "ggml-backend.h" +#include "ggml.h" + +// GGML CPU internal header + +ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void); diff --git a/ggml/src/ggml-cpu/ggml-cpu-traits.cpp b/ggml/src/ggml-cpu/ggml-cpu-traits.cpp new file mode 100644 index 000000000..62a0712da --- /dev/null +++ 
b/ggml/src/ggml-cpu/ggml-cpu-traits.cpp @@ -0,0 +1,36 @@ +#include "ggml-cpu-traits.h" + +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +namespace ggml::cpu { +tensor_traits::~tensor_traits() {} + +extra_buffer_type::~extra_buffer_type() {} +} // namespace ggml::cpu + +bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) { + for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) { + if (extra && extra->context) { + auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context; + auto tensor_traits = buf_extra->get_tensor_traits(op); + if (tensor_traits && tensor_traits->compute_forward(params, op)) { + return true; + } + } + } + return false; +} + +bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) { + for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) { + if (extra && extra->context) { + auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context; + auto tensor_traits = buf_extra->get_tensor_traits(op); + if (tensor_traits && tensor_traits->work_size(n_threads, op, *size)) { + return true; + } + } + } + return false; +} diff --git a/ggml/src/ggml-cpu/ggml-cpu-traits.h b/ggml/src/ggml-cpu/ggml-cpu-traits.h new file mode 100644 index 000000000..99a6186b1 --- /dev/null +++ b/ggml/src/ggml-cpu/ggml-cpu-traits.h @@ -0,0 +1,38 @@ +#pragma once +#include "ggml-backend-impl.h" +#include "ggml-cpu-impl.h" +#include "ggml.h" + +#ifdef __cplusplus +# include +extern "C" { +#endif + +// return true if op part of extra "accelerator" +bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op); +bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size); + +#ifdef __cplusplus +} + +namespace ggml::cpu { +// register in tensor->extra +class tensor_traits { + public: + virtual ~tensor_traits(); + virtual bool work_size(int n_threads, const struct ggml_tensor * op, size_t & size) = 0; + virtual bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) = 0; +}; + +class extra_buffer_type { + public: + virtual ~extra_buffer_type(); + virtual bool supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) = 0; + virtual tensor_traits * get_tensor_traits(const struct ggml_tensor * op) = 0; +}; +} // namespace ggml::cpu + +// implemented in ggml-cpu.cpp. 
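+// Rough sketch of how this registry is used (see ggml-cpu.cpp): the CPU backend keeps a list
+// of "extra" buffer types (e.g. the AArch64 repack type from this patch); ggml_cpu_extra_work_size()
+// and ggml_cpu_extra_compute_forward() walk that list, ask each buffer type's extra_buffer_type
+// for tensor_traits matching the op, and forward work_size()/compute_forward() to whichever
+// traits object handles it.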
+std::vector & ggml_backend_cpu_get_extra_buffers_type(); + +#endif diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index f12b62e3b..ea17d6077 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -3,7 +3,7 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" -#include "ggml-cpu-aarch64.h" +#include "ggml-cpu-traits.h" #include "ggml-cpu-impl.h" #include "ggml-cpu.h" #include "ggml-impl.h" @@ -224,10 +224,6 @@ typedef void * thread_ret_t; typedef pthread_t ggml_thread_t; -#ifdef GGML_USE_CPU_HBM -#include -#endif - #if defined(__APPLE__) #include #include @@ -301,7 +297,6 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { }, [GGML_TYPE_Q8_0] = { .from_float = quantize_row_q8_0, - .from_float_to_mat = quantize_mat_q8_0, .vec_dot = ggml_vec_dot_q8_0_q8_0, .vec_dot_type = GGML_TYPE_Q8_0, #if defined (__ARM_FEATURE_MATMUL_INT8) @@ -409,33 +404,6 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_BF16, .nrows = 1, }, - [GGML_TYPE_Q4_0_4_4] = { - .from_float = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_0, - .nrows = 1, - .ncols = 4, - .gemv = ggml_gemv_q4_0_4x4_q8_0, - .gemm = ggml_gemm_q4_0_4x4_q8_0, - }, - [GGML_TYPE_Q4_0_4_8] = { - .from_float = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_0, - .nrows = 1, - .ncols = 4, - .gemv = ggml_gemv_q4_0_4x8_q8_0, - .gemm = ggml_gemm_q4_0_4x8_q8_0, - }, - [GGML_TYPE_Q4_0_8_8] = { - .from_float = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_0, - .nrows = 1, - .ncols = 8, - .gemv = ggml_gemv_q4_0_8x8_q8_0, - .gemm = ggml_gemm_q4_0_8x8_q8_0, - }, [GGML_TYPE_TQ1_0] = { .from_float = quantize_row_tq1_0, .vec_dot = ggml_vec_dot_tq1_0_q8_K, @@ -448,15 +416,6 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, }, - [GGML_TYPE_IQ4_NL_4_4] = { - .from_float = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_0, - .nrows = 1, - .ncols = 4, - .gemv = ggml_gemv_iq4_nl_4x4_q8_0, - .gemm = ggml_gemm_iq4_nl_4x4_q8_0, - }, }; const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) { @@ -4509,9 +4468,6 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: { ggml_compute_forward_add_q_f32(params, dst); } break; @@ -4889,9 +4845,6 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: { ggml_compute_forward_add1_q_f32(params, dst); } break; @@ -5019,9 +4972,6 @@ static void ggml_compute_forward_acc( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: default: { GGML_ABORT("fatal error"); @@ -7437,27 +7387,9 @@ static void ggml_compute_forward_mul_mat( const int ith = params->ith; const int nth = params->nth; - enum ggml_type type = src0->type; - - if (src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) { - type = (enum ggml_type)(intptr_t)src0->extra; - } - -#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) - if (src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) { - ggml_backend_amx_mul_mat(params, dst); - return; - } -#endif - - enum ggml_type const vec_dot_type = 
type_traits_cpu[type].vec_dot_type; + enum ggml_type const vec_dot_type = type_traits_cpu[src0->type].vec_dot_type; ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; - ggml_from_float_to_mat_t const from_float_to_mat = type_traits_cpu[vec_dot_type].from_float_to_mat; - int64_t const vec_dot_num_rows = type_traits_cpu[type].nrows; - int64_t const matmul_num_cols = type_traits_cpu[type].ncols; - int64_t const blck_size_interleave = ggml_get_type_traits(type)->blck_size_interleave; - ggml_gemv_t const gemv = type_traits_cpu[type].gemv; - ggml_gemm_t const gemm = type_traits_cpu[type].gemm; + int64_t const vec_dot_num_rows = type_traits_cpu[src0->type].nrows; GGML_ASSERT(ne0 == ne01); GGML_ASSERT(ne1 == ne11); @@ -7465,7 +7397,7 @@ static void ggml_compute_forward_mul_mat( GGML_ASSERT(ne3 == ne13); // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb00 == ggml_type_size(src0->type)); GGML_ASSERT(nb10 == ggml_type_size(src1->type)); // dst cannot be transposed or permuted @@ -7477,6 +7409,7 @@ static void ggml_compute_forward_mul_mat( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows + // TODO: extract to "extra_op" #if GGML_USE_LLAMAFILE // broadcast factors const int64_t r2 = ne12 / ne02; @@ -7487,15 +7420,15 @@ static void ggml_compute_forward_mul_mat( if (src1_cont) { for (int64_t i13 = 0; i13 < ne13; i13++) for (int64_t i12 = 0; i12 < ne12; i12++) - if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type), + if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type), (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, - nb01/ggml_type_size(type), + nb01/ggml_type_size(src0->type), (const char *)src1->data + i12*nb12 + i13*nb13, nb11/ggml_type_size(src1->type), (char *)dst->data + i12*nb2 + i13*nb3, nb1/ggml_type_size(dst->type), ith, nth, - type, + src0->type, src1->type, dst->type)) goto UseGgmlGemm1; @@ -7516,19 +7449,10 @@ UseGgmlGemm1:; for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { - int64_t i11_processed = 0; - if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) { - for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) { - from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), - (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), - 4, ne10, blck_size_interleave); - } - i11_processed = ne11 - ne11 % 4; - } - for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) { + for (int64_t i11 = ith; i11 < ne11; i11 += nth) { from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), - (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), - ne10); + (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), + ne10); } } } @@ -7548,15 +7472,15 @@ UseGgmlGemm1:; for (int64_t i13 = 0; i13 < ne13; i13++) for (int64_t i12 = 0; i12 < ne12; i12++) - if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type), + if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type), (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, - nb01/ggml_type_size(type), + nb01/ggml_type_size(src0->type), (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size, row_size/ggml_type_size(vec_dot_type), (char *)dst->data + i12*nb2 + i13*nb3, nb1/ggml_type_size(dst->type), ith, nth, - type, + src0->type, vec_dot_type, dst->type)) goto UseGgmlGemm2; @@ -7598,28 +7522,6 @@ UseGgmlGemm2:; const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; - if ((ggml_n_dims(src0) == 2) 
&& gemv) { - const void * src1_wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; - const size_t src1_col_stride = ggml_is_contiguous(src1) || src1->type != vec_dot_type ? ggml_row_size(vec_dot_type, ne10) : nb11; - int64_t src0_start = (ith * ne01) / nth; - int64_t src0_end = ((ith + 1) * ne01) / nth; - src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start; - src0_end = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end; - if (src0_start >= src0_end) return; - - // If there are more than three rows in src1, use gemm; otherwise, use gemv. - if (gemm && (ne11 > 3)) { - gemm(ne00, (float *)((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01, - (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start); - } - for (int iter = gemm ? ne11 - ne11 % 4 : 0; iter < ne11; iter++) { - gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01, - (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1, - src0_end - src0_start); - } - return; - } - // The first chunk comes from our thread_id, the rest will get auto-assigned. int current_chunk = ith; @@ -7642,7 +7544,7 @@ UseGgmlGemm2:; num_rows_per_vec_dot = 1; } - ggml_compute_forward_mul_mat_one_chunk(params, dst, type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); + ggml_compute_forward_mul_mat_one_chunk(params, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); if (nth >= nchunk0 * nchunk1) { break; @@ -7674,8 +7576,6 @@ static void ggml_compute_forward_mul_mat_id( ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; - int64_t const matmul_num_cols = type_traits_cpu[type].ncols; - ggml_gemv_t const gemv = type_traits_cpu[type].gemv; // we don't support permuted src0 or src1 GGML_ASSERT(nb00 == ggml_type_size(type)); @@ -7761,34 +7661,6 @@ static void ggml_compute_forward_mul_mat_id( const int64_t nr0 = ne01; // src0 rows const int64_t nr1 = cne1; // src1 rows - if (((ggml_n_dims(src0) - 1) == 2) && gemv) { - int64_t src0_cur_start = (ith * ne01) / nth; - int64_t src0_cur_end = ((ith + 1) * ne01) / nth; - src0_cur_start = (src0_cur_start % matmul_num_cols) ? src0_cur_start + matmul_num_cols - (src0_cur_start % matmul_num_cols): src0_cur_start; - src0_cur_end = (src0_cur_end % matmul_num_cols) ? src0_cur_end + matmul_num_cols - (src0_cur_end % matmul_num_cols): src0_cur_end; - if (src0_cur_start >= src0_cur_end) return; - - for (int ir1 = 0; ir1 < nr1; ir1++) { - struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1); - const int id = row_mapping.i1; // selected expert index - - const int64_t i11 = id % ne11; - const int64_t i12 = row_mapping.i2; // row index in src1 - - const int64_t i1 = id; // selected expert index - const int64_t i2 = i12; // row - - const char * src1_col = (const char *) wdata + - (src1_cont || src1->type != vec_dot_type - ? 
(i11 + i12 * ne11) * row_size - : (i11 * nb11 + i12 * nb12)); - - gemv(ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01, - (const char *) src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start); - } - continue; - } - // distribute the thread work across the inner or outer loop based on which one is larger const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows @@ -8096,9 +7968,6 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: { ggml_compute_forward_out_prod_q_f32(params, dst); } break; @@ -8361,9 +8230,6 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: default: { GGML_ABORT("fatal error"); @@ -8625,9 +8491,6 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: { ggml_compute_forward_get_rows_q(params, dst); } break; @@ -9217,10 +9080,6 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ2_S: case GGML_TYPE_Q8_K: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: - case GGML_TYPE_IQ4_NL_4_4: case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: @@ -12426,6 +12285,9 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm return; } + // extra_buffer op? + if (ggml_cpu_extra_compute_forward(params, tensor)) return; + switch (tensor->op) { case GGML_OP_DUP: { @@ -13373,146 +13235,142 @@ struct ggml_cplan ggml_graph_plan( size_t cur = 0; - switch (node->op) { - case GGML_OP_CPY: - case GGML_OP_DUP: - { - if (ggml_is_quantized(node->type) || - // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32 - (node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) || - (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) { + if (!ggml_cpu_extra_work_size(n_threads, node, &cur)) { + + switch (node->op) { + case GGML_OP_CPY: + case GGML_OP_DUP: + { + if (ggml_is_quantized(node->type) || + // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32 + (node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) || + (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) { + cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; + } + } break; + case GGML_OP_ADD: + case GGML_OP_ADD1: + { + if (ggml_is_quantized(node->src[0]->type)) { + cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; + } + } break; + case GGML_OP_ACC: + { + if (ggml_is_quantized(node->src[0]->type)) { + cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks; + } + } break; + case GGML_OP_COUNT_EQUAL: + { + cur = ggml_type_size(node->type)*n_tasks; + } break; + case GGML_OP_MUL_MAT: + { + const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type; + + if (node->src[1]->type != vec_dot_type) { + cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); + } + } break; + case GGML_OP_MUL_MAT_ID: + { + cur = 0; + const struct ggml_tensor * src0 = node->src[0]; + const struct ggml_tensor * src1 = node->src[1]; + const enum ggml_type 
vec_dot_type = type_traits_cpu[src0->type].vec_dot_type; + if (src1->type != vec_dot_type) { + cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)); + } + const int n_as = src0->ne[2]; + cur += GGML_PAD(cur, sizeof(int64_t)); // align + cur += n_as * sizeof(int64_t); // matrix_row_counts + cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows + } break; + case GGML_OP_OUT_PROD: + { + if (ggml_is_quantized(node->src[0]->type)) { + cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; + } + } break; + case GGML_OP_SOFT_MAX: + case GGML_OP_ROPE: + { cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; - } - } break; - case GGML_OP_ADD: - case GGML_OP_ADD1: - { - if (ggml_is_quantized(node->src[0]->type)) { - cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; - } - } break; - case GGML_OP_ACC: - { - if (ggml_is_quantized(node->src[0]->type)) { - cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks; - } - } break; - case GGML_OP_COUNT_EQUAL: - { - cur = ggml_type_size(node->type)*n_tasks; - } break; - case GGML_OP_MUL_MAT: - { -#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) - if (node->src[0]->buffer && ggml_backend_amx_buft_is_amx(node->src[0]->buffer->buft)) { - cur = ggml_backend_amx_desired_wsize(node); - } -#endif - const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type; + } break; + case GGML_OP_CONV_TRANSPOSE_1D: + { + GGML_ASSERT(node->src[0]->ne[3] == 1); + GGML_ASSERT(node->src[1]->ne[2] == 1); + GGML_ASSERT(node->src[1]->ne[3] == 1); - if (node->src[1]->type != vec_dot_type) { - size_t cur2 = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); - cur = MAX(cur, cur2); - } - } break; - case GGML_OP_MUL_MAT_ID: - { - cur = 0; - const struct ggml_tensor * src0 = node->src[0]; - const struct ggml_tensor * src1 = node->src[1]; - const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type; - if (src1->type != vec_dot_type) { - cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)); - } - const int n_as = src0->ne[2]; - cur += GGML_PAD(cur, sizeof(int64_t)); // align - cur += n_as * sizeof(int64_t); // matrix_row_counts - cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows - } break; - case GGML_OP_OUT_PROD: - { - if (ggml_is_quantized(node->src[0]->type)) { - cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; - } - } break; - case GGML_OP_SOFT_MAX: - case GGML_OP_ROPE: - { - cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; - } break; - case GGML_OP_CONV_TRANSPOSE_1D: - { - GGML_ASSERT(node->src[0]->ne[3] == 1); - GGML_ASSERT(node->src[1]->ne[2] == 1); - GGML_ASSERT(node->src[1]->ne[3] == 1); + const int64_t ne00 = node->src[0]->ne[0]; // K + const int64_t ne01 = node->src[0]->ne[1]; // Cout + const int64_t ne02 = node->src[0]->ne[2]; // Cin + const int64_t ne10 = node->src[1]->ne[0]; // L + const int64_t ne11 = node->src[1]->ne[1]; // Cin - const int64_t ne00 = node->src[0]->ne[0]; // K - const int64_t ne01 = node->src[0]->ne[1]; // Cout - const int64_t ne02 = node->src[0]->ne[2]; // Cin + if ((node->src[0]->type == GGML_TYPE_F16 || + node->src[0]->type == GGML_TYPE_BF16) && + node->src[1]->type == GGML_TYPE_F32) { + cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02; + cur += sizeof(ggml_fp16_t)*ne10*ne11; + } else if (node->src[0]->type == GGML_TYPE_F32 && + node->src[1]->type == GGML_TYPE_F32) { + cur += sizeof(float)*ne00*ne01*ne02; + cur += sizeof(float)*ne10*ne11; + } else { + GGML_ABORT("fatal error"); + } + } break; + case 
GGML_OP_CONV_TRANSPOSE_2D: + { + const int64_t ne00 = node->src[0]->ne[0]; // W + const int64_t ne01 = node->src[0]->ne[1]; // H + const int64_t ne02 = node->src[0]->ne[2]; // Channels Out + const int64_t ne03 = node->src[0]->ne[3]; // Channels In - const int64_t ne10 = node->src[1]->ne[0]; // L - const int64_t ne11 = node->src[1]->ne[1]; // Cin + const int64_t ne10 = node->src[1]->ne[0]; // W + const int64_t ne11 = node->src[1]->ne[1]; // H + const int64_t ne12 = node->src[1]->ne[2]; // Channels In - if ((node->src[0]->type == GGML_TYPE_F16 || - node->src[0]->type == GGML_TYPE_BF16) && - node->src[1]->type == GGML_TYPE_F32) { - cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02; - cur += sizeof(ggml_fp16_t)*ne10*ne11; - } else if (node->src[0]->type == GGML_TYPE_F32 && - node->src[1]->type == GGML_TYPE_F32) { - cur += sizeof(float)*ne00*ne01*ne02; - cur += sizeof(float)*ne10*ne11; - } else { + cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03; + cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12; + } break; + case GGML_OP_FLASH_ATTN_EXT: + { + const int64_t ne00 = node->src[0]->ne[0]; // D + + cur = 3*sizeof(float)*ne00*n_tasks; // 3x head size/thread + } break; + case GGML_OP_FLASH_ATTN_BACK: + { + const int64_t D = node->src[0]->ne[0]; + const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back + if (node->src[1]->type == GGML_TYPE_F32) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } else if (node->src[1]->type == GGML_TYPE_F16) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } else if (node->src[1]->type == GGML_TYPE_BF16) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } + } break; + + case GGML_OP_CROSS_ENTROPY_LOSS: + { + cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks); + } break; + case GGML_OP_COUNT: + { GGML_ABORT("fatal error"); } - } break; - case GGML_OP_CONV_TRANSPOSE_2D: - { - const int64_t ne00 = node->src[0]->ne[0]; // W - const int64_t ne01 = node->src[0]->ne[1]; // H - const int64_t ne02 = node->src[0]->ne[2]; // Channels Out - const int64_t ne03 = node->src[0]->ne[3]; // Channels In - - const int64_t ne10 = node->src[1]->ne[0]; // W - const int64_t ne11 = node->src[1]->ne[1]; // H - const int64_t ne12 = node->src[1]->ne[2]; // Channels In - - cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03; - cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12; - } break; - case GGML_OP_FLASH_ATTN_EXT: - { - const int64_t ne00 = node->src[0]->ne[0]; // D - - cur = 3*sizeof(float)*ne00*n_tasks; // 3x head size/thread - } break; - case GGML_OP_FLASH_ATTN_BACK: - { - const int64_t D = node->src[0]->ne[0]; - const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL); - const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back - if (node->src[1]->type == GGML_TYPE_F32) { - cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 - } else if (node->src[1]->type == GGML_TYPE_F16) { - cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 - } else if (node->src[1]->type == 
GGML_TYPE_BF16) { - cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 - } - } break; - - case GGML_OP_CROSS_ENTROPY_LOSS: - { - cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks); - } break; - case GGML_OP_COUNT: - { - GGML_ABORT("fatal error"); - } - default: - break; + default: + break; + } } work_size = MAX(work_size, cur); diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index d3b4bdb96..c390957af 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -2,12 +2,18 @@ #include "ggml-backend-impl.h" #include "ggml-cpu.h" #include "ggml-cpu-aarch64.h" +#include "ggml-cpu-traits.h" #include "ggml-impl.h" #include "amx/amx.h" + #include #include #include +#ifdef GGML_USE_CPU_HBM +#include "ggml-cpu-hbm.h" +#endif + #if defined(__APPLE__) #include #include @@ -23,115 +29,7 @@ // ggml-backend interface -#ifdef GGML_USE_CPU_HBM - -// buffer type HBM - -#include - -static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - return "CPU_HBM"; - - GGML_UNUSED(buft); -} - -static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { - hbw_free(buffer->context); -} - -static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - void * ptr; - int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size); - if (result != 0) { - GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size); - return NULL; - } - - ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); - buffer->buft = buft; - buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer; - - return buffer; -} - -ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { - static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = { - /* .iface = */ { - /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name, - /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .is_host = */ ggml_backend_cpu_buffer_type_is_host, - }, - /* .context = */ NULL, - }; - - return &ggml_backend_cpu_buffer_type_hbm; -} -#endif - -// buffer type AARCH64 - -static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { - tensor->extra = (void *)ggml_aarch64_get_optimal_repack_type(tensor); // NOLINT - - GGML_UNUSED(buffer); -} - -static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - GGML_ASSERT(offset == 0); - GGML_ASSERT(size == ggml_nbytes(tensor)); - - enum ggml_type repack_type = (enum ggml_type)(intptr_t)tensor->extra; - - ggml_aarch64_repack_tensor(tensor, repack_type, data, size); - - GGML_UNUSED(buffer); -} - -static const char * ggml_backend_cpu_aarch64_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - return "CPU_AARCH64"; - - GGML_UNUSED(buft); -} - -static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - auto * buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); - - if (buffer == NULL) { - return NULL; - 
} - - buffer->buft = buft; - buffer->iface.init_tensor = ggml_backend_cpu_aarch64_buffer_init_tensor; - buffer->iface.set_tensor = ggml_backend_cpu_aarch64_buffer_set_tensor; - - return buffer; -} - -ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) { - static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_aarch64 = { - /* .iface = */ { - /* .get_name = */ ggml_backend_cpu_aarch64_buffer_type_get_name, - /* .alloc_buffer = */ ggml_backend_cpu_aarch64_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .is_host = */ NULL, - }, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), - /* .context = */ NULL, - }; - - return &ggml_backend_cpu_buffer_type_aarch64; -} - -bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft) { - return buft == ggml_backend_cpu_aarch64_buffer_type(); -} - -static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) { +std::vector& ggml_backend_cpu_get_extra_buffers_type() { static std::vector bufts = []() { std::vector bufts; @@ -152,11 +50,22 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backen return bufts; }(); - return bufts.data(); + return bufts; +} + +static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) { + return ggml_backend_cpu_get_extra_buffers_type().data(); GGML_UNUSED(device); } +static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) { + for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) { + if (extra && extra == buft) return true; + } + return false; +} + // CPU backend - backend (stream) struct ggml_backend_cpu_context { @@ -465,25 +374,19 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st return true; } - if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) { - if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) { - return false; + // extra_buffer_op? + for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) { + if (extra) { + auto buf_extra = (ggml::cpu::extra_buffer_type*) extra->context; + if (buf_extra && buf_extra->supports_op(dev, op)) { + return true; + } } } -#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) - if (src0 && src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) { - return ggml_backend_amx_device_supports_op(op); - } - for (int i = 1; i < GGML_MAX_SRC; i++) { - if (op->src[i] && op->src[i]->buffer && ggml_backend_amx_buft_is_amx(op->src[i]->buffer->buft)) { - return false; - } - } -#endif - - for (int i = 1; i < GGML_MAX_SRC; i++) { - if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) { + // the other case need host buffer. 
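+    // For ops that no extra buffer type claimed above, the generic CPU kernels read the
+    // tensor data directly, so every source must live in a host-accessible buffer; any
+    // non-host source makes the op unsupported here.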
+ for (int i = 0; i < GGML_MAX_SRC; i++) { + if (op->src[i] && op->src[i]->buffer && !ggml_backend_buft_is_host(op->src[i]->buffer->buft)) { return false; } } @@ -506,19 +409,10 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st default: return true; } - - GGML_UNUSED(dev); } static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - bool supported = ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft); - -#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) - supported = supported || ggml_backend_amx_buft_is_amx(buft); -#endif - - return supported; - + return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_is_extra_buffer_type(buft); GGML_UNUSED(dev); } @@ -666,10 +560,12 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) { if (strcmp(name, "ggml_backend_set_n_threads") == 0) { - return (void *)ggml_backend_cpu_set_n_threads; + ggml_backend_set_n_threads_t fct = ggml_backend_cpu_set_n_threads; + return (void *)fct; } if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) { - return (void *)ggml_backend_cpu_get_extra_bufts; + ggml_backend_dev_get_extra_bufts_t fct = ggml_backend_cpu_device_get_extra_buffers_type; + return (void *)fct; } if (strcmp(name, "ggml_backend_get_features") == 0) { return (void *)ggml_backend_cpu_get_features; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index d6e4bfdd0..15fcb2a65 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3210,7 +3210,7 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con static const ggml_backend_reg_i ggml_backend_cuda_reg_interface = { /* .get_name = */ ggml_backend_cuda_reg_get_name, /* .get_device_count = */ ggml_backend_cuda_reg_get_device_count, - /* .get_device_get = */ ggml_backend_cuda_reg_get_device, + /* .get_device = */ ggml_backend_cuda_reg_get_device, /* .get_proc_address = */ ggml_backend_cuda_reg_get_proc_address, }; diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 7301a9c6c..7918388ae 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -5220,15 +5220,6 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte { VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb); } break; - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - { - VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4); - } break; - case GGML_TYPE_Q4_0_8_8: - { - VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8); - } break; case GGML_TYPE_I8: case GGML_TYPE_I16: diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 135efb521..ae3baedc7 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4630,7 +4630,7 @@ static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, cons static const ggml_backend_reg_i ggml_backend_sycl_reg_interface = { /* .get_name = */ ggml_backend_sycl_reg_get_name, /* .get_device_count = */ ggml_backend_sycl_reg_get_device_count, - /* .get_device_get = */ ggml_backend_sycl_reg_get_device, + /* .get_device = */ ggml_backend_sycl_reg_get_device, /* .get_proc_address = */ ggml_backend_sycl_reg_get_proc_address, }; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 2c338dee5..058941c7a 100644 --- 
a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -8,7 +8,10 @@ // FIXME: required here for quantization functions #include "ggml-quants.h" -#include "ggml-aarch64.h" + +#ifdef GGML_USE_CPU_HBM +#include +#endif #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW @@ -788,32 +791,23 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row, .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref, }, - [GGML_TYPE_Q4_0_4_4] = { - .type_name = "q4_0_4x4", - .blck_size = QK4_0, - .blck_size_interleave = 4, - .type_size = sizeof(block_q4_0), - .is_quantized = true, - .to_float = NULL, - .from_float_ref = NULL, + [31] = { // GGML_TYPE_Q4_0_4_4 + .type_name = "TYPE_Q4_0_4_4 REMOVED, use Q4_0 with runtime repacking", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, }, - [GGML_TYPE_Q4_0_4_8] = { - .type_name = "q4_0_4x8", - .blck_size = QK4_0, - .blck_size_interleave = 8, - .type_size = sizeof(block_q4_0), - .is_quantized = true, - .to_float = NULL, - .from_float_ref = NULL, + [32] = { // GGML_TYPE_Q4_0_4_8 + .type_name = "TYPE_Q4_0_4_8 REMOVED, use Q4_0 with runtime repacking", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, }, - [GGML_TYPE_Q4_0_8_8] = { - .type_name = "q4_0_8x8", - .blck_size = QK4_0, - .blck_size_interleave = 8, - .type_size = sizeof(block_q4_0), - .is_quantized = true, - .to_float = NULL, - .from_float_ref = NULL, + [33] = { // GGML_TYPE_Q4_0_8_8 + .type_name = "TYPE_Q4_0_8_8 REMOVED, use Q4_0 with runtime repacking", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, }, [GGML_TYPE_TQ1_0] = { .type_name = "tq1_0", @@ -831,14 +825,23 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_tq2_0, .from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref, }, - [GGML_TYPE_IQ4_NL_4_4] = { - .type_name = "iq4_nl_4x4", - .blck_size = QK4_NL, - .blck_size_interleave = 4, - .type_size = sizeof(block_iq4_nl), - .is_quantized = true, - .to_float = NULL, - .from_float_ref = NULL, + [36] = { // GGML_TYPE_IQ4_NL_4_4 + .type_name = "TYPE_IQ4_NL_4_4 REMOVED, use IQ4_NL with runtime repacking", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, + }, + [37] = { // GGML_TYPE_IQ4_NL_4_8 + .type_name = "TYPE_IQ4_NL_4_8 REMOVED, use IQ4_NL with runtime repacking", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, + }, + [38] = { // GGML_TYPE_IQ4_NL_8_8 + .type_name = "TYPE_IQ4_NL_8_8 REMOVED, use IQ4_NL with runtime repacking", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, }, }; @@ -1270,9 +1273,6 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break; case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break; case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break; - case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break; - case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break; - case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; } @@ -6304,9 +6304,6 @@ size_t ggml_quantize_chunk( case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, 
n_per_row, imatrix); break; case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); @@ -6838,7 +6835,16 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p (int64_t) info->ne[2] * (int64_t) info->ne[3]; - if (ggml_blck_size(info->type) == 0 || ne % ggml_blck_size(info->type) != 0) { + if (ggml_blck_size(info->type) == 0 ) { + // this tensor type support have been removed: + fprintf(stderr, "%s: tensor '%s' of type %d: %s\n", + __func__, info->name.data, (int) info->type, ggml_type_name(info->type)); + fclose(file); + gguf_free(ctx); + return NULL; + } + + if (ne % ggml_blck_size(info->type) != 0) { fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n", __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type)); fclose(file); diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 703199fcb..66247b803 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -1432,9 +1432,6 @@ class GGMLQuantizationType(IntEnum): F64 = 28 IQ1_M = 29 BF16 = 30 - Q4_0_4_4 = 31 - Q4_0_4_8 = 32 - Q4_0_8_8 = 33 TQ1_0 = 34 TQ2_0 = 35 @@ -1478,9 +1475,9 @@ class LlamaFileType(IntEnum): MOSTLY_IQ4_XS = 30 # except 1d tensors MOSTLY_IQ1_M = 31 # except 1d tensors MOSTLY_BF16 = 32 # except 1d tensors - MOSTLY_Q4_0_4_4 = 33 # except 1d tensors - MOSTLY_Q4_0_4_8 = 34 # except 1d tensors - MOSTLY_Q4_0_8_8 = 35 # except 1d tensors + # MOSTLY_Q4_0_4_4 = 33 # removed from gguf files, use Q4_0 and runtime repack + # MOSTLY_Q4_0_4_8 = 34 # removed from gguf files, use Q4_0 and runtime repack + # MOSTLY_Q4_0_8_8 = 35 # removed from gguf files, use Q4_0 and runtime repack MOSTLY_TQ1_0 = 36 # except 1d tensors MOSTLY_TQ2_0 = 37 # except 1d tensors @@ -1556,9 +1553,6 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = { GGMLQuantizationType.F64: (1, 8), GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32), GGMLQuantizationType.BF16: (1, 2), - GGMLQuantizationType.Q4_0_4_4:(32, 2 + 16), - GGMLQuantizationType.Q4_0_4_8:(32, 2 + 16), - GGMLQuantizationType.Q4_0_8_8:(32, 2 + 16), GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13), GGMLQuantizationType.TQ2_0: (256, 2 + 64), } diff --git a/include/llama.h b/include/llama.h index d121354c1..36945cde3 100644 --- a/include/llama.h +++ b/include/llama.h @@ -172,9 +172,9 @@ extern "C" { LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors + //LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // removed from gguf files, use Q4_0 and runtime repack + //LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // removed from gguf files, use Q4_0 and runtime repack + //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // 
removed from gguf files, use Q4_0 and runtime repack LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors diff --git a/src/llama.cpp b/src/llama.cpp index ba4a9dfcf..cae3f76ad 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4578,9 +4578,6 @@ struct llama_model_loader { case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; - case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break; - case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break; - case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break; default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); @@ -5344,9 +5341,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; - case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4"; - case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8"; - case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8"; default: return "unknown, may not work"; } @@ -18367,10 +18361,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ3_S; } - else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || - new_type == GGML_TYPE_Q4_0_8_8) { - new_type = GGML_TYPE_Q4_0; - } else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) { new_type = GGML_TYPE_Q4_K; } @@ -18693,9 +18683,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break; - case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break; - case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break; - case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } @@ -19034,14 +19021,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s f32_data = (float *) f32_conv_buf.data(); } - int chunk_size_multiplier = 1; - if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) { - if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0; - else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0; - if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8; - else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4; - } - LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type)); fflush(stdout); @@ -19054,8 +19033,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s const int64_t nrows = tensor->ne[1]; static const int64_t min_chunk_size = 32 * 512; - const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) * - chunk_size_multiplier; + const int64_t chunk_size = (n_per_row >= min_chunk_size ? 
n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)); const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1]; const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size; From ce4a7b849388b67d45ad420bdd82d5efcd55647a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 7 Dec 2024 18:02:05 +0200 Subject: [PATCH 23/99] server : various fixes (#10704) * server : various fixes ggml-ci * server : show curent seed in slot_params ggml-ci * fix /slots endpoint * Update examples/server/server.cpp Co-authored-by: Georgi Gerganov * server : reflect endpoint response changes in the readme ggml-ci --------- Co-authored-by: Xuan Son Nguyen Co-authored-by: Xuan Son Nguyen --- examples/server/CMakeLists.txt | 8 -- examples/server/README.md | 187 ++++++++++++++++++++++++--------- examples/server/server.cpp | 76 +++++++------- examples/server/utils.hpp | 4 +- 4 files changed, 178 insertions(+), 97 deletions(-) diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 0035859a6..63fca1d59 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -34,14 +34,6 @@ endforeach() add_executable(${TARGET} ${TARGET_SRCS}) install(TARGETS ${TARGET} RUNTIME) -# clean up generated files in pre-build step -foreach(asset ${PUBLIC_ASSETS}) - set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp") - add_custom_command(TARGET ${TARGET} PRE_BUILD - COMMAND "${CMAKE_COMMAND}" -E remove -f "${output}" - ) -endforeach() - target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT}) if (LLAMA_SERVER_SSL) diff --git a/examples/server/README.md b/examples/server/README.md index 8dbed2626..0bab40a82 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -618,9 +618,76 @@ This endpoint is public (no API key check). By default, it is read-only. To make ```json { - "default_generation_settings": { ... }, + "default_generation_settings": { + "id": 0, + "id_task": -1, + "n_ctx": 1024, + "speculative": false, + "is_processing": false, + "params": { + "n_predict": -1, + "seed": 4294967295, + "temperature": 0.800000011920929, + "dynatemp_range": 0.0, + "dynatemp_exponent": 1.0, + "top_k": 40, + "top_p": 0.949999988079071, + "min_p": 0.05000000074505806, + "xtc_probability": 0.0, + "xtc_threshold": 0.10000000149011612, + "typical_p": 1.0, + "repeat_last_n": 64, + "repeat_penalty": 1.0, + "presence_penalty": 0.0, + "frequency_penalty": 0.0, + "dry_multiplier": 0.0, + "dry_base": 1.75, + "dry_allowed_length": 2, + "dry_penalty_last_n": -1, + "dry_sequence_breakers": [ + "\n", + ":", + "\"", + "*" + ], + "mirostat": 0, + "mirostat_tau": 5.0, + "mirostat_eta": 0.10000000149011612, + "penalize_nl": false, + "stop": [], + "max_tokens": -1, + "n_keep": 0, + "n_discard": 0, + "ignore_eos": false, + "stream": true, + "n_probs": 0, + "min_keep": 0, + "grammar": "", + "samplers": [ + "dry", + "top_k", + "typ_p", + "top_p", + "min_p", + "xtc", + "temperature" + ], + "speculative.n_max": 16, + "speculative.n_min": 5, + "speculative.p_min": 0.8999999761581421, + "timings_per_token": false + }, + "prompt": "", + "next_token": { + "has_next_token": true, + "has_new_line": false, + "n_remain": -1, + "n_decoded": 0, + "stopping_word": "" + } + }, "total_slots": 1, - "chat_template": "" + "chat_template": "..." 
} ``` @@ -739,56 +806,74 @@ Example: ```json [ - { - "dynatemp_exponent": 1.0, - "dynatemp_range": 0.0, - "frequency_penalty": 0.0, - "grammar": "", - "id": 0, - "ignore_eos": false, - "is_processing": false, - "logit_bias": [], - "min_p": 0.05000000074505806, - "mirostat": 0, - "mirostat_eta": 0.10000000149011612, - "mirostat_tau": 5.0, - "model": "llama-2-7b-32k-instruct.Q2_K.gguf", - "n_ctx": 2048, - "n_keep": 0, - "n_predict": 100000, - "n_probs": 0, - "next_token": { - "has_next_token": true, - "n_remain": -1, - "n_decoded": 0, - "stopped_eos": false, - "stopped_limit": false, - "stopped_word": false, - "stopping_word": "" - }, - "penalize_nl": true, - "presence_penalty": 0.0, - "prompt": "Say hello to llama.cpp", - "repeat_last_n": 64, - "repeat_penalty": 1.100000023841858, - "samplers": [ - "top_k", - "typical_p", - "top_p", - "min_p", - "temperature" - ], - "seed": 42, - "stop": [ - "\n" - ], - "stream": false, - "task_id": 0, - "temperature": 0.0, - "top_k": 40, - "top_p": 0.949999988079071, - "typical_p": 1.0 + { + "id": 0, + "id_task": -1, + "n_ctx": 1024, + "speculative": false, + "is_processing": false, + "params": { + "n_predict": -1, + "seed": 4294967295, + "temperature": 0.800000011920929, + "dynatemp_range": 0.0, + "dynatemp_exponent": 1.0, + "top_k": 40, + "top_p": 0.949999988079071, + "min_p": 0.05000000074505806, + "xtc_probability": 0.0, + "xtc_threshold": 0.10000000149011612, + "typical_p": 1.0, + "repeat_last_n": 64, + "repeat_penalty": 1.0, + "presence_penalty": 0.0, + "frequency_penalty": 0.0, + "dry_multiplier": 0.0, + "dry_base": 1.75, + "dry_allowed_length": 2, + "dry_penalty_last_n": -1, + "dry_sequence_breakers": [ + "\n", + ":", + "\"", + "*" + ], + "mirostat": 0, + "mirostat_tau": 5.0, + "mirostat_eta": 0.10000000149011612, + "penalize_nl": false, + "stop": [], + "max_tokens": -1, + "n_keep": 0, + "n_discard": 0, + "ignore_eos": false, + "stream": true, + "n_probs": 0, + "min_keep": 0, + "grammar": "", + "samplers": [ + "dry", + "top_k", + "typ_p", + "top_p", + "min_p", + "xtc", + "temperature" + ], + "speculative.n_max": 16, + "speculative.n_min": 5, + "speculative.p_min": 0.8999999761581421, + "timings_per_token": false + }, + "prompt": "", + "next_token": { + "has_next_token": true, + "has_new_line": false, + "n_remain": -1, + "n_decoded": 0, + "stopping_word": "" } + } ] ``` diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d57a296a2..1ce8fbae2 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -122,11 +122,6 @@ struct slot_params { struct common_params_sampling sampling; struct common_params_speculative speculative; - // params only used in to_json() - int32_t n_ctx; - uint32_t seed_cur; - bool can_speculative; - // OAI-compat fields bool verbose = false; bool oaicompat = false; @@ -134,7 +129,7 @@ struct slot_params { std::string oaicompat_model; std::string oaicompat_cmpl_id; - json to_json() { + json to_json() const { std::vector samplers; samplers.reserve(sampling.samplers.size()); for (const auto & sampler : sampling.samplers) { @@ -142,8 +137,8 @@ struct slot_params { } return json { - {"n_ctx", n_ctx}, {"n_predict", n_predict}, // Server configured n_predict + {"seed", sampling.seed}, {"temperature", sampling.temp}, {"dynatemp_range", sampling.dynatemp_range}, {"dynatemp_exponent", sampling.dynatemp_exponent}, @@ -177,7 +172,6 @@ struct slot_params { {"min_keep", sampling.min_keep}, {"grammar", sampling.grammar}, {"samplers", samplers}, - {"speculative", can_speculative}, {"speculative.n_max", 
speculative.n_max}, {"speculative.n_min", speculative.n_min}, {"speculative.p_min", speculative.p_min}, @@ -483,12 +477,6 @@ struct server_task_result_cmpl_partial : server_task_result { return std::vector({initial_ret, second_ret}); } } else { - // Some idiosyncrasy in task processing logic makes several trailing calls - // with empty content, we ignore these at the calee site. - if (content.empty()) { - return std::vector({json::object()}); - } - choices = json::array({json{ {"finish_reason", nullptr}, {"index", 0}, @@ -722,6 +710,7 @@ struct server_slot { llama_batch batch_spec = {}; + llama_context * ctx = nullptr; llama_context * ctx_dft = nullptr; common_speculative * spec = nullptr; @@ -906,6 +895,27 @@ struct server_slot { t_token_generation, n_decoded, t_gen, n_gen_second, t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded); } + + json to_json() const { + return json { + {"id", id}, + {"id_task", id_task}, + {"n_ctx", n_ctx}, + {"speculative", can_speculate()}, + {"is_processing", is_processing()}, + {"params", params.to_json()}, + {"prompt", common_detokenize(ctx, prompt_tokens)}, + {"next_token", + { + {"has_next_token", has_next_token}, + {"has_new_line", has_new_line}, + {"n_remain", n_remaining}, + {"n_decoded", n_decoded}, + {"stopping_word", stopping_word}, + } + }, + }; + } }; struct server_metrics { @@ -1338,6 +1348,7 @@ struct server_context { server_slot slot; slot.id = i; + slot.ctx = ctx; slot.n_ctx = n_ctx_slot; slot.n_predict = params_base.n_predict; @@ -1370,8 +1381,7 @@ struct server_context { slots.push_back(slot); } - default_generation_settings_for_props = slots[0].params.to_json(); - default_generation_settings_for_props["seed"] = -1; + default_generation_settings_for_props = slots[0].to_json(); // the update_slots() logic will always submit a maximum of n_batch or n_parallel tokens // note that n_batch can be > n_ctx (e.g. 
for non-causal attention models such as BERT where the KV cache is not used) @@ -1848,17 +1858,18 @@ struct server_context { queue_results.send(std::move(res)); } - void send_partial_response(server_slot & slot, completion_token_output tkn) { + void send_partial_response(server_slot & slot, const completion_token_output & tkn) { auto res = std::make_unique(); - res->id = slot.id_task; - res->index = slot.index; - res->content = tkn.text_to_send; + + res->id = slot.id_task; + res->index = slot.index; + res->content = tkn.text_to_send; res->truncated = slot.truncated; res->n_decoded = slot.n_decoded; res->n_prompt_tokens = slot.n_prompt_tokens; - res->stop = slot.stop; + res->stop = slot.stop; res->verbose = slot.params.verbose; res->oaicompat = slot.params.oaicompat; @@ -1869,6 +1880,7 @@ struct server_context { // populate res.probs_output if (slot.params.sampling.n_probs > 0) { const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); + const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size()); const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size()); @@ -1891,7 +1903,8 @@ struct server_context { void send_final_response(server_slot & slot) { if (slot.params.stream) { // if in stream mode, send the last partial response - return send_partial_response(slot, {0, "", {}}); + send_partial_response(slot, {0, "", {}}); + return; } auto res = std::make_unique(); @@ -2012,6 +2025,7 @@ struct server_context { std::vector tasks; auto create_task = [&](json & task_data, llama_tokens & prompt_tokens) { SRV_DBG("create task, n_tokens = %d\n", (int) prompt_tokens.size()); + server_task task; task.id = queue_tasks.get_new_id(); task.inf_type = inf_type; @@ -2205,18 +2219,7 @@ struct server_context { int n_processing_slots = 0; for (server_slot & slot : slots) { - json slot_data = slot.params.to_json(); - slot_data["id"] = slot.id; - slot_data["id_task"] = slot.id_task; - slot_data["is_processing"] = slot.is_processing(); - slot_data["prompt"] = common_detokenize(ctx, slot.prompt_tokens); - slot_data["next_token"] = { - {"has_next_token", slot.has_next_token}, - {"has_new_line", slot.has_new_line}, - {"n_remain", slot.n_remaining}, - {"n_decoded", slot.n_decoded}, - {"stopping_word", slot.stopping_word}, - }; + json slot_data = slot.to_json(); if (slot.is_processing()) { n_processing_slots++; @@ -2230,6 +2233,7 @@ struct server_context { auto res = std::make_unique(); res->id = task.id; + res->slots_data = std::move(slots_data); res->n_idle_slots = n_idle_slots; res->n_processing_slots = n_processing_slots; res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); @@ -3003,11 +3007,11 @@ int main(int argc, char ** argv) { res.status = 200; }; - svr->set_exception_handler([&res_error](const httplib::Request &, httplib::Response & res, std::exception_ptr ep) { + svr->set_exception_handler([&res_error](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) { std::string message; try { std::rethrow_exception(ep); - } catch (std::exception & e) { + } catch (const std::exception & e) { message = e.what(); } catch (...) 
{ message = "Unknown Exception"; diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index a96116ac3..c9fe7d966 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -327,12 +327,12 @@ static std::string llama_get_chat_template(const struct llama_model * model) { std::string template_key = "tokenizer.chat_template"; // call with NULL buffer to get the total size of the string int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0); - if (res < 0) { + if (res < 2) { return ""; } else { std::vector model_template(res, 0); llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size()); - return std::string(model_template.data(), model_template.size()); + return std::string(model_template.data(), model_template.size() - 1); } } From d9c3ba2b7749c00df477599aa141a98b4521aa2c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 7 Dec 2024 18:38:15 +0200 Subject: [PATCH 24/99] ggml : disable iq4_nl interleave size 8 (#10709) ggml-ci --- ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp | 30 +++++++++++++++----------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp index 9b9e3c92a..a51d1a6c5 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp @@ -3748,16 +3748,18 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s const int end = QK4_NL * 2 / blck_size_interleave; - if (blck_size_interleave == 8) { - for (int i = 0; i < end; ++i) { - int src_id = i % 4; - int src_offset = (i / 4) * blck_size_interleave; - int dst_offset = i * blck_size_interleave; + // TODO: this branch seems wrong + //if (blck_size_interleave == 8) { + // for (int i = 0; i < end; ++i) { + // int src_id = i % 4; + // int src_offset = (i / 4) * blck_size_interleave; + // int dst_offset = i * blck_size_interleave; - // Using memcpy to avoid unaligned memory accesses - memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t)); - } - } else if (blck_size_interleave == 4) { + // // Using memcpy to avoid unaligned memory accesses + // memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t)); + // } + //} else + if (blck_size_interleave == 4) { for (int i = 0; i < end; ++i) { int src_id = i % 4; int src_offset = (i / 4) * blck_size_interleave; @@ -3774,7 +3776,8 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL); - GGML_ASSERT(interleave_block == 4 || interleave_block == 8); + //GGML_ASSERT(interleave_block == 4 || interleave_block == 8); + GGML_ASSERT(interleave_block == 4); block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data; const block_iq4_nl * src = (const block_iq4_nl *)data; @@ -3825,9 +3828,10 @@ template <> int repack(struct ggml_tensor * t, const void * return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size); } -template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { - return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size); -} +// TODO: needs to be revisited +//template <> int repack(struct ggml_tensor * t, const void * data, size_t data_size) { +// return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size); +//} // gemv template From 3573fa8e7b7f0865638b52b4e9b4d2006f0558a2 Mon Sep 17 00:00:00 2001 
From: Xuan Son Nguyen Date: Sat, 7 Dec 2024 20:21:09 +0100 Subject: [PATCH 25/99] server : (refactor) no more json in server_task input (#10691) * server : (refactor) no more json in server_task input * add test for slots endpoint * add tests for /props and /slots * remove task inf_type * fix CI by adding safe_json_to_str * add "model_path" to /props * update readme --- examples/server/README.md | 2 + examples/server/server.cpp | 744 +++++++++--------- examples/server/tests/unit/test_basic.py | 30 + .../server/tests/unit/test_chat_completion.py | 5 + examples/server/tests/utils.py | 10 +- examples/server/utils.hpp | 20 +- 6 files changed, 427 insertions(+), 384 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 0bab40a82..117c52d3f 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -687,12 +687,14 @@ This endpoint is public (no API key check). By default, it is read-only. To make } }, "total_slots": 1, + "model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf", "chat_template": "..." } ``` - `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint. - `total_slots` - the total number of slots for process requests (defined by `--parallel` option) +- `model_path` - the path to model file (same with `-m` argument) - `chat_template` - the model's original Jinja2 prompt template ### POST `/props`: Change server global properties. diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1ce8fbae2..1c21e55aa 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -54,7 +54,10 @@ enum server_state { }; enum server_task_type { - SERVER_TASK_TYPE_INFERENCE, + SERVER_TASK_TYPE_COMPLETION, + SERVER_TASK_TYPE_EMBEDDING, + SERVER_TASK_TYPE_RERANK, + SERVER_TASK_TYPE_INFILL, SERVER_TASK_TYPE_CANCEL, SERVER_TASK_TYPE_NEXT_RESPONSE, SERVER_TASK_TYPE_METRICS, @@ -64,13 +67,6 @@ enum server_task_type { SERVER_TASK_TYPE_SET_LORA, }; -enum server_task_inf_type { - SERVER_TASK_INF_TYPE_COMPLETION, - SERVER_TASK_INF_TYPE_EMBEDDING, - SERVER_TASK_INF_TYPE_RERANK, - SERVER_TASK_INF_TYPE_INFILL, -}; - // https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 enum error_type { ERROR_TYPE_INVALID_REQUEST, @@ -82,28 +78,6 @@ enum error_type { ERROR_TYPE_NOT_SUPPORTED, // custom error }; -struct server_task { - int id = -1; // to be filled by server_queue - int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL - - llama_tokens prompt_tokens; - server_task_type type; - - // TODO @ngxson : we should get rid of json type here - json data; - - server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION; - - // utility function - static std::unordered_set get_list_id(const std::vector & tasks) { - std::unordered_set ids(tasks.size()); - for (size_t i = 0; i < tasks.size(); i++) { - ids.insert(tasks[i].id); - } - return ids; - } -}; - struct slot_params { bool stream = true; bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt @@ -118,6 +92,7 @@ struct slot_params { std::vector antiprompt; bool timings_per_token = false; + bool ignore_eos = false; struct common_params_sampling sampling; struct common_params_speculative speculative; @@ -167,7 +142,7 @@ struct slot_params { {"n_discard", n_discard}, {"ignore_eos", sampling.ignore_eos}, {"stream", stream}, - //{"logit_bias", sampling.logit_bias}, + {"logit_bias", 
format_logit_bias(sampling.logit_bias)}, {"n_probs", sampling.n_probs}, {"min_keep", sampling.min_keep}, {"grammar", sampling.grammar}, @@ -180,6 +155,209 @@ struct slot_params { } }; +struct server_task { + int id = -1; // to be filled by server_queue + int index = -1; // used when there are multiple prompts (batch request) + + server_task_type type; + + // used by SERVER_TASK_TYPE_CANCEL + int id_target = -1; + + // used by SERVER_TASK_TYPE_INFERENCE + slot_params params; + llama_tokens prompt_tokens; + int id_selected_slot = -1; + + // used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE + struct slot_action { + int slot_id; + std::string filename; + std::string filepath; + }; + slot_action slot_action; + + // used by SERVER_TASK_TYPE_METRICS + bool metrics_reset_bucket = false; + + server_task(server_task_type type) : type(type) {} + + static slot_params params_from_json_cmpl( + const llama_model * model, + const common_params & params_base, + const json & data) { + slot_params params; + + // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them) + slot_params defaults; + defaults.sampling = params_base.sampling; + defaults.speculative = params_base.speculative; + + // enabling this will output extra debug information in the HTTP responses from the server + params.verbose = params_base.verbosity > 9; + params.timings_per_token = json_value(data, "timings_per_token", false); + + params.stream = json_value(data, "stream", false); + params.cache_prompt = json_value(data, "cache_prompt", true); + params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict)); + params.n_indent = json_value(data, "n_indent", defaults.n_indent); + params.n_keep = json_value(data, "n_keep", defaults.n_keep); + params.n_discard = json_value(data, "n_discard", defaults.n_discard); + //params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement + params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms); + + params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k); + params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p); + params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p); + params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability); + params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold); + params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p); + params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp); + params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range); + params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent); + params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n); + params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat); + params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq); + params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present); + params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier); + params.sampling.dry_base = json_value(data, "dry_base", 
defaults.sampling.dry_base); + params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length); + params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n); + params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat); + params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau); + params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta); + params.sampling.penalize_nl = json_value(data, "penalize_nl", defaults.sampling.penalize_nl); + params.sampling.seed = json_value(data, "seed", defaults.sampling.seed); + params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); + params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep); + + params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min); + params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max); + params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min); + + params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min); + params.speculative.n_min = std::max(params.speculative.n_min, 2); + params.speculative.n_max = std::max(params.speculative.n_max, 0); + + if (params.sampling.dry_base < 1.0f) { + params.sampling.dry_base = defaults.sampling.dry_base; + } + + // sequence breakers for DRY + { + // Currently, this is not compatible with TextGen WebUI, Koboldcpp and SillyTavern format + // Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39 + + if (data.contains("dry_sequence_breakers")) { + params.sampling.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector()); + if (params.sampling.dry_sequence_breakers.empty()) { + throw std::runtime_error("Error: dry_sequence_breakers must be a non-empty array of strings"); + } + } + } + + // process "json_schema" and "grammar" + if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) { + throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both"); + } + if (data.contains("json_schema") && !data.contains("grammar")) { + try { + auto schema = json_value(data, "json_schema", json::object()); + params.sampling.grammar = json_schema_to_grammar(schema); + } catch (const std::exception & e) { + throw std::runtime_error(std::string("\"json_schema\": ") + e.what()); + } + } else { + params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar); + } + + { + params.sampling.logit_bias.clear(); + params.ignore_eos = json_value(data, "ignore_eos", false); + + const auto & logit_bias = data.find("logit_bias"); + if (logit_bias != data.end() && logit_bias->is_array()) { + const int n_vocab = llama_n_vocab(model); + for (const auto & el : *logit_bias) { + // TODO: we may want to throw errors here, in case "el" is incorrect + if (el.is_array() && el.size() == 2) { + float bias; + if (el[1].is_number()) { + bias = el[1].get(); + } else if (el[1].is_boolean() && !el[1].get()) { + bias = -INFINITY; + } else { + continue; + } + + if (el[0].is_number_integer()) { + llama_token tok = el[0].get(); + if (tok >= 0 && tok < n_vocab) { + params.sampling.logit_bias.push_back({tok, bias}); + } + } else if 
(el[0].is_string()) { + auto toks = common_tokenize(model, el[0].get(), false); + for (auto tok : toks) { + params.sampling.logit_bias.push_back({tok, bias}); + } + } + } + } + } + } + + { + params.antiprompt.clear(); + + const auto & stop = data.find("stop"); + if (stop != data.end() && stop->is_array()) { + for (const auto & word : *stop) { + if (!word.empty()) { + params.antiprompt.push_back(word); + } + } + } + } + + { + const auto & samplers = data.find("samplers"); + if (samplers != data.end()) { + if (samplers->is_array()) { + std::vector sampler_names; + for (const auto & name : *samplers) { + if (name.is_string()) { + sampler_names.emplace_back(name); + } + } + params.sampling.samplers = common_sampler_types_from_names(sampler_names, false); + } else if (samplers->is_string()){ + std::string sampler_string; + for (const auto & name : *samplers) { + sampler_string += name; + } + params.sampling.samplers = common_sampler_types_from_chars(sampler_string); + } + } else { + params.sampling.samplers = defaults.sampling.samplers; + } + } + + std::string model_name = params_base.model_alias.empty() ? DEFAULT_OAICOMPAT_MODEL : params_base.model_alias; + params.oaicompat_model = json_value(data, "model", model_name); + + return params; + } + + // utility function + static std::unordered_set get_list_id(const std::vector & tasks) { + std::unordered_set ids(tasks.size()); + for (size_t i = 0; i < tasks.size(); i++) { + ids.insert(tasks[i].id); + } + return ids; + } +}; + struct result_timings { int32_t prompt_n = -1; double prompt_ms; @@ -191,7 +369,7 @@ struct result_timings { double predicted_per_token_ms; double predicted_per_second; - json to_json() { + json to_json() const { return { {"prompt_n", prompt_n}, {"prompt_ms", prompt_ms}, @@ -623,7 +801,8 @@ struct server_task_result_metrics : server_task_result { uint64_t n_decode_total = 0; uint64_t n_busy_slots_total = 0; - // TODO: get rid of this json object and use to_json() instead + // while we can also use std::vector this requires copying the slot object which can be quite messy + // therefore, we use json to temporarily store the slot.to_json() result json slots_data = json::array(); virtual json to_json() override { @@ -708,6 +887,9 @@ struct server_slot { int id; int id_task = -1; + // only used for completion/embedding/infill/rerank + server_task_type task_type = SERVER_TASK_TYPE_COMPLETION; + llama_batch batch_spec = {}; llama_context * ctx = nullptr; @@ -746,8 +928,6 @@ struct server_slot { llama_tokens cache_tokens; std::vector generated_token_probs; - server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION; - bool has_next_token = true; bool has_new_line = false; bool truncated = false; @@ -787,11 +967,15 @@ struct server_slot { n_past = 0; n_sent_text = 0; n_sent_token_probs = 0; - inf_type = SERVER_TASK_INF_TYPE_COMPLETION; + task_type = SERVER_TASK_TYPE_COMPLETION; generated_token_probs.clear(); } + bool is_non_causal() const { + return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK; + } + bool has_budget(const common_params & global_params) { if (params.n_predict == -1 && global_params.n_predict == -1) { return true; // limitless @@ -903,6 +1087,7 @@ struct server_slot { {"n_ctx", n_ctx}, {"speculative", can_speculate()}, {"is_processing", is_processing()}, + {"non_causal", is_non_causal()}, {"params", params.to_json()}, {"prompt", common_detokenize(ctx, prompt_tokens)}, {"next_token", @@ -988,9 +1173,7 @@ struct server_queue { // Add a new task to the end of the queue int 
post(server_task task, bool front = false) { std::unique_lock lock(mutex_tasks); - if (task.id == -1) { - task.id = id++; - } + GGML_ASSERT(task.id != -1); QUE_DBG("new task, id = %d, front = %d\n", task.id, front); if (front) { queue_tasks.push_front(std::move(task)); @@ -1468,104 +1651,14 @@ struct server_context { } bool launch_slot_with_task(server_slot & slot, const server_task & task) { - // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them) - slot_params defaults; - defaults.sampling = params_base.sampling; - defaults.speculative = params_base.speculative; + slot.reset(); + slot.id_task = task.id; + slot.index = task.index; + slot.task_type = task.type; + slot.params = std::move(task.params); + slot.prompt_tokens = std::move(task.prompt_tokens); - const auto & data = task.data; - - if (data.count("__oaicompat") != 0) { - std::string model_name = params_base.model_alias.empty() ? DEFAULT_OAICOMPAT_MODEL : params_base.model_alias; - slot.params.oaicompat = true; - slot.params.oaicompat_chat = json_value(data, "__oaicompat_chat", false); - slot.params.oaicompat_model = json_value(data, "model", model_name); - slot.params.oaicompat_cmpl_id = json_value(data, "completion_id", std::string()); - } else { - slot.params.oaicompat = false; - } - - - // enabling this will output extra debug information in the HTTP responses from the server - slot.params.verbose = params_base.verbosity > 9; - slot.params.timings_per_token = json_value(data, "timings_per_token", false); - - slot.params.stream = json_value(data, "stream", false); - slot.params.cache_prompt = json_value(data, "cache_prompt", true); - slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict)); - slot.params.n_indent = json_value(data, "n_indent", defaults.n_indent); - slot.params.n_keep = json_value(data, "n_keep", defaults.n_keep); - slot.params.n_discard = json_value(data, "n_discard", defaults.n_discard); - //slot.params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement - slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms); - - slot.params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k); - slot.params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p); - slot.params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p); - slot.params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability); - slot.params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold); - slot.params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p); - slot.params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp); - slot.params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range); - slot.params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent); - slot.params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n); - slot.params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat); - slot.params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq); - slot.params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present); - 
slot.params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier); - slot.params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base); - slot.params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length); - slot.params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n); - slot.params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat); - slot.params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau); - slot.params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta); - slot.params.sampling.penalize_nl = json_value(data, "penalize_nl", defaults.sampling.penalize_nl); - slot.params.sampling.seed = json_value(data, "seed", defaults.sampling.seed); - slot.params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); - slot.params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep); - - slot.params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min); - slot.params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max); - slot.params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min); - - slot.params.speculative.n_min = std::min(slot.params.speculative.n_max, slot.params.speculative.n_min); - slot.params.speculative.n_min = std::max(slot.params.speculative.n_min, 2); - slot.params.speculative.n_max = std::max(slot.params.speculative.n_max, 0); - - if (slot.params.sampling.dry_base < 1.0f) { - slot.params.sampling.dry_base = defaults.sampling.dry_base; - } - - // sequence breakers for DRY - { - // Currently, this is not compatible with TextGen WebUI, Koboldcpp and SillyTavern format - // Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39 - - if (data.contains("dry_sequence_breakers")) { - slot.params.sampling.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector()); - if (slot.params.sampling.dry_sequence_breakers.empty()) { - send_error(task, "Error: dry_sequence_breakers must be a non-empty array of strings", ERROR_TYPE_INVALID_REQUEST); - return false; - } - } - } - - // process "json_schema" and "grammar" - if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) { - send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST); - return false; - } - if (data.contains("json_schema") && !data.contains("grammar")) { - try { - auto schema = json_value(data, "json_schema", json::object()); - slot.params.sampling.grammar = json_schema_to_grammar(schema); - } catch (const std::exception & e) { - send_error(task, std::string("\"json_schema\": ") + e.what(), ERROR_TYPE_INVALID_REQUEST); - return false; - } - } else { - slot.params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar); - } + SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str()); if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { // Might be better to reject the request with a 400 ? 
@@ -1573,78 +1666,8 @@ struct server_context { SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.n_predict, slot.n_predict); } - { - slot.params.sampling.logit_bias.clear(); - - if (json_value(data, "ignore_eos", false) && has_eos_token) { - slot.params.sampling.logit_bias.push_back({llama_token_eos(model), -INFINITY}); - } - - const auto & logit_bias = data.find("logit_bias"); - if (logit_bias != data.end() && logit_bias->is_array()) { - const int n_vocab = llama_n_vocab(model); - for (const auto & el : *logit_bias) { - // TODO: we may want to throw errors here, in case "el" is incorrect - if (el.is_array() && el.size() == 2) { - float bias; - if (el[1].is_number()) { - bias = el[1].get(); - } else if (el[1].is_boolean() && !el[1].get()) { - bias = -INFINITY; - } else { - continue; - } - - if (el[0].is_number_integer()) { - llama_token tok = el[0].get(); - if (tok >= 0 && tok < n_vocab) { - slot.params.sampling.logit_bias.push_back({tok, bias}); - } - } else if (el[0].is_string()) { - auto toks = common_tokenize(model, el[0].get(), false); - for (auto tok : toks) { - slot.params.sampling.logit_bias.push_back({tok, bias}); - } - } - } - } - } - } - - { - slot.params.antiprompt.clear(); - - const auto & stop = data.find("stop"); - if (stop != data.end() && stop->is_array()) { - for (const auto & word : *stop) { - if (!word.empty()) { - slot.params.antiprompt.push_back(word); - } - } - } - } - - { - const auto & samplers = data.find("samplers"); - if (samplers != data.end()) { - if (samplers->is_array()) { - std::vector sampler_names; - for (const auto & name : *samplers) { - if (name.is_string()) { - sampler_names.emplace_back(name); - } - } - slot.params.sampling.samplers = common_sampler_types_from_names(sampler_names, false); - } else if (samplers->is_string()){ - std::string sampler_string; - for (const auto & name : *samplers) { - sampler_string += name; - } - slot.params.sampling.samplers = common_sampler_types_from_chars(sampler_string); - } - } else { - slot.params.sampling.samplers = defaults.sampling.samplers; - } + if (slot.params.ignore_eos && has_eos_token) { + slot.params.sampling.logit_bias.push_back({llama_token_eos(model), -INFINITY}); } { @@ -2020,82 +2043,13 @@ struct server_context { // Functions to create new task(s) and receive result(s) // - // break the input "prompt" into multiple tasks if needed, then format and tokenize the input prompt(s) - std::vector create_tasks_inference(json data, server_task_inf_type inf_type) { - std::vector tasks; - auto create_task = [&](json & task_data, llama_tokens & prompt_tokens) { - SRV_DBG("create task, n_tokens = %d\n", (int) prompt_tokens.size()); - - server_task task; - task.id = queue_tasks.get_new_id(); - task.inf_type = inf_type; - task.type = SERVER_TASK_TYPE_INFERENCE; - task.data = task_data; - task.prompt_tokens = std::move(prompt_tokens); - tasks.push_back(std::move(task)); - }; - - static constexpr const char * error_msg = "\"prompt\" must be a string, an array of token ids or an array of prompts"; - if (!data.contains("prompt")) { - throw std::runtime_error(error_msg); - } - - // because llama_tokenize api is thread-safe, we can tokenize the prompt from HTTP thread - bool add_special = inf_type != SERVER_TASK_INF_TYPE_RERANK && inf_type != SERVER_TASK_INF_TYPE_INFILL; - std::vector tokenized_prompts = tokenize_input_prompts(ctx, data.at("prompt"), add_special, true); - switch (inf_type) { - case SERVER_TASK_INF_TYPE_RERANK: - { - // prompts[0] is the question - // the rest are the 
answers/documents - GGML_ASSERT(tokenized_prompts.size() > 1); - SRV_DBG("creating rerank tasks, n_prompts = %d\n", (int) tokenized_prompts.size() - 1); - for (size_t i = 1; i < tokenized_prompts.size(); i++) { - data["index"] = i - 1; - auto tokens = format_rerank(model, tokenized_prompts[0], tokenized_prompts[i]); - create_task(data, tokens); - } - } break; - case SERVER_TASK_INF_TYPE_INFILL: - { - SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size()); - for (size_t i = 0; i < tokenized_prompts.size(); i++) { - data["index"] = i; - auto tokens = format_infill( - ctx, - data.at("input_prefix"), - data.at("input_suffix"), - data.at("input_extra"), - params_base.n_batch, - params_base.n_predict, - slots[0].n_ctx, // TODO: there should be a better way - params_base.spm_infill, - tokenized_prompts[i] - ); - create_task(data, tokens); - } - } break; - default: - { - SRV_DBG("creating multi-prompt tasks, n_prompts = %d\n", (int) tokenized_prompts.size()); - for (size_t i = 0; i < tokenized_prompts.size(); i++) { - data["index"] = i; - create_task(data, tokenized_prompts[i]); - } - } - } - - return tasks; - } - void cancel_tasks(const std::unordered_set & id_tasks) { std::vector cancel_tasks; cancel_tasks.reserve(id_tasks.size()); for (const auto & id_task : id_tasks) { SRV_WRN("cancel task, id_task = %d\n", id_task); - server_task task; - task.type = SERVER_TASK_TYPE_CANCEL; + server_task task(SERVER_TASK_TYPE_CANCEL); task.id_target = id_task; cancel_tasks.push_back(task); queue_results.remove_waiting_task_id(id_task); @@ -2104,7 +2058,7 @@ struct server_context { queue_tasks.post(cancel_tasks, true); } - // receive the results from task(s) created by create_tasks_inference + // receive the results from task(s) void receive_multi_results( const std::unordered_set & id_tasks, const std::function&)> & result_handler, @@ -2131,7 +2085,7 @@ struct server_context { result_handler(results); } - // receive the results from task(s) created by create_tasks_inference, in stream mode + // receive the results from task(s), in stream mode void receive_cmpl_results_stream( const std::unordered_set & id_tasks, const std::function & result_handler, @@ -2166,9 +2120,12 @@ struct server_context { void process_single_task(server_task task) { switch (task.type) { - case SERVER_TASK_TYPE_INFERENCE: + case SERVER_TASK_TYPE_COMPLETION: + case SERVER_TASK_TYPE_INFILL: + case SERVER_TASK_TYPE_EMBEDDING: + case SERVER_TASK_TYPE_RERANK: { - const int id_slot = json_value(task.data, "id_slot", -1); + const int id_slot = task.id_selected_slot; server_slot * slot = id_slot != -1 ? 
get_slot_by_id(id_slot) : get_available_slot(task); @@ -2185,13 +2142,6 @@ struct server_context { break; } - slot->reset(); - - slot->id_task = task.id; - slot->inf_type = task.inf_type; - slot->index = json_value(task.data, "index", 0); - slot->prompt_tokens = std::move(task.prompt_tokens); - if (!launch_slot_with_task(*slot, task)) { SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id); break; @@ -2255,14 +2205,14 @@ struct server_context { res->n_decode_total = metrics.n_decode_total; res->n_busy_slots_total = metrics.n_busy_slots_total; - if (json_value(task.data, "reset_bucket", false)) { + if (task.metrics_reset_bucket) { metrics.reset_bucket(); } queue_results.send(std::move(res)); } break; case SERVER_TASK_TYPE_SLOT_SAVE: { - int id_slot = task.data.at("id_slot"); + int id_slot = task.slot_action.slot_id; server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); @@ -2278,8 +2228,8 @@ struct server_context { const size_t token_count = slot->cache_tokens.size(); const int64_t t_start = ggml_time_us(); - std::string filename = task.data.at("filename"); - std::string filepath = task.data.at("filepath"); + std::string filename = task.slot_action.filename; + std::string filepath = task.slot_action.filepath; const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), token_count); @@ -2298,7 +2248,7 @@ struct server_context { } break; case SERVER_TASK_TYPE_SLOT_RESTORE: { - int id_slot = task.data.at("id_slot"); + int id_slot = task.slot_action.slot_id; server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); @@ -2313,8 +2263,8 @@ struct server_context { const int64_t t_start = ggml_time_us(); - std::string filename = task.data.at("filename"); - std::string filepath = task.data.at("filepath"); + std::string filename = task.slot_action.filename; + std::string filepath = task.slot_action.filepath; slot->cache_tokens.resize(slot->n_ctx); size_t token_count = 0; @@ -2341,7 +2291,7 @@ struct server_context { } break; case SERVER_TASK_TYPE_SLOT_ERASE: { - int id_slot = task.data.at("id_slot"); + int id_slot = task.slot_action.slot_id; server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); @@ -2400,10 +2350,8 @@ struct server_context { { SRV_DBG("%s", "posting NEXT_RESPONSE\n"); - server_task task; - task.type = SERVER_TASK_TYPE_NEXT_RESPONSE; - task.id_target = -1; - + server_task task(SERVER_TASK_TYPE_NEXT_RESPONSE); + task.id = queue_tasks.get_new_id(); queue_tasks.post(task); } @@ -2517,7 +2465,7 @@ struct server_context { continue; } - if (slot.inf_type == SERVER_TASK_INF_TYPE_EMBEDDING || slot.inf_type == SERVER_TASK_INF_TYPE_RERANK) { + if (slot.is_non_causal()) { if (slot.n_prompt_tokens > n_ubatch) { slot.release(); send_error(slot, "input is too large to process. 
increase the physical batch size", ERROR_TYPE_SERVER); @@ -2632,7 +2580,7 @@ struct server_context { } // non-causal tasks require to fit the entire prompt in the physical batch - if (slot.inf_type == SERVER_TASK_INF_TYPE_EMBEDDING || slot.inf_type == SERVER_TASK_INF_TYPE_RERANK) { + if (slot.is_non_causal()) { // cannot fit the prompt in the current batch - will try next iter if (batch.n_tokens + slot.n_prompt_tokens > n_batch) { continue; @@ -2640,10 +2588,7 @@ struct server_context { } // check that we are in the right batch_type, if not defer the slot - const bool slot_type = - slot.inf_type == SERVER_TASK_INF_TYPE_EMBEDDING || - slot.inf_type == SERVER_TASK_INF_TYPE_RERANK ? 1 : 0; - + int slot_type = slot.is_non_causal(); if (batch_type == -1) { batch_type = slot_type; } else if (batch_type != slot_type) { @@ -2760,7 +2705,7 @@ struct server_context { } if (slot.state == SLOT_STATE_DONE_PROMPT) { - if (slot.inf_type == SERVER_TASK_INF_TYPE_EMBEDDING) { + if (slot.task_type == SERVER_TASK_TYPE_EMBEDDING) { // prompt evaluated for embedding send_embedding(slot, batch_view); slot.release(); @@ -2768,7 +2713,7 @@ struct server_context { continue; // continue loop of slots } - if (slot.inf_type == SERVER_TASK_INF_TYPE_RERANK) { + if (slot.task_type == SERVER_TASK_TYPE_RERANK) { send_rerank(slot, batch_view); slot.release(); slot.i_batch = -1; @@ -2998,12 +2943,12 @@ int main(int argc, char ** argv) { auto res_error = [](httplib::Response & res, const json & error_data) { json final_response {{"error", error_data}}; - res.set_content(final_response.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON); + res.set_content(safe_json_to_str(final_response), MIMETYPE_JSON); res.status = json_value(error_data, "code", 500); }; auto res_ok = [](httplib::Response & res, const json & data) { - res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON); + res.set_content(safe_json_to_str(data), MIMETYPE_JSON); res.status = 200; }; @@ -3140,10 +3085,8 @@ int main(int argc, char ** argv) { } // request slots data using task queue - server_task task; + server_task task(SERVER_TASK_TYPE_METRICS); task.id = ctx_server.queue_tasks.get_new_id(); - task.type = SERVER_TASK_TYPE_METRICS; - ctx_server.queue_results.add_waiting_task_id(task.id); ctx_server.queue_tasks.post(task, true); // high-priority task @@ -3178,11 +3121,9 @@ int main(int argc, char ** argv) { } // request slots data using task queue - server_task task; + server_task task(SERVER_TASK_TYPE_METRICS); task.id = ctx_server.queue_tasks.get_new_id(); - task.id_target = -1; - task.type = SERVER_TASK_TYPE_METRICS; - task.data.push_back({{"reset_bucket", true}}); + task.metrics_reset_bucket = true; ctx_server.queue_results.add_waiting_task_id(task.id); ctx_server.queue_tasks.post(task, true); // high-priority task @@ -3286,19 +3227,17 @@ int main(int argc, char ** argv) { } std::string filepath = params.slot_save_path + filename; - server_task task; - task.type = SERVER_TASK_TYPE_SLOT_SAVE; - task.data = { - { "id_slot", id_slot }, - { "filename", filename }, - { "filepath", filepath }, - }; + server_task task(SERVER_TASK_TYPE_SLOT_SAVE); + task.id = ctx_server.queue_tasks.get_new_id(); + task.slot_action.slot_id = id_slot; + task.slot_action.filename = filename; + task.slot_action.filepath = filepath; - const int id_task = ctx_server.queue_tasks.post(task); - ctx_server.queue_results.add_waiting_task_id(id_task); + ctx_server.queue_results.add_waiting_task_id(task.id); + 
ctx_server.queue_tasks.post(task); - server_task_result_ptr result = ctx_server.queue_results.recv(id_task); - ctx_server.queue_results.remove_waiting_task_id(id_task); + server_task_result_ptr result = ctx_server.queue_results.recv(task.id); + ctx_server.queue_results.remove_waiting_task_id(task.id); if (result->is_error()) { res_error(res, result->to_json()); @@ -3317,19 +3256,17 @@ int main(int argc, char ** argv) { } std::string filepath = params.slot_save_path + filename; - server_task task; - task.type = SERVER_TASK_TYPE_SLOT_RESTORE; - task.data = { - { "id_slot", id_slot }, - { "filename", filename }, - { "filepath", filepath }, - }; + server_task task(SERVER_TASK_TYPE_SLOT_RESTORE); + task.id = ctx_server.queue_tasks.get_new_id(); + task.slot_action.slot_id = id_slot; + task.slot_action.filename = filename; + task.slot_action.filepath = filepath; - const int id_task = ctx_server.queue_tasks.post(task); - ctx_server.queue_results.add_waiting_task_id(id_task); + ctx_server.queue_results.add_waiting_task_id(task.id); + ctx_server.queue_tasks.post(task); - server_task_result_ptr result = ctx_server.queue_results.recv(id_task); - ctx_server.queue_results.remove_waiting_task_id(id_task); + server_task_result_ptr result = ctx_server.queue_results.recv(task.id); + ctx_server.queue_results.remove_waiting_task_id(task.id); if (result->is_error()) { res_error(res, result->to_json()); @@ -3341,17 +3278,15 @@ int main(int argc, char ** argv) { }; const auto handle_slots_erase = [&ctx_server, &res_error, &res_ok](const httplib::Request & /* req */, httplib::Response & res, int id_slot) { - server_task task; - task.type = SERVER_TASK_TYPE_SLOT_ERASE; - task.data = { - { "id_slot", id_slot }, - }; + server_task task(SERVER_TASK_TYPE_SLOT_ERASE); + task.id = ctx_server.queue_tasks.get_new_id(); + task.slot_action.slot_id = id_slot; - const int id_task = ctx_server.queue_tasks.post(task); - ctx_server.queue_results.add_waiting_task_id(id_task); + ctx_server.queue_results.add_waiting_task_id(task.id); + ctx_server.queue_tasks.post(task); - server_task_result_ptr result = ctx_server.queue_results.recv(id_task); - ctx_server.queue_results.remove_waiting_task_id(id_task); + server_task_result_ptr result = ctx_server.queue_results.recv(task.id); + ctx_server.queue_results.remove_waiting_task_id(task.id); if (result->is_error()) { res_error(res, result->to_json()); @@ -3392,9 +3327,11 @@ int main(int argc, char ** argv) { }; const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) { + // this endpoint is publicly available, please only return what is safe to be exposed json data = { { "default_generation_settings", ctx_server.default_generation_settings_for_props }, { "total_slots", ctx_server.params_base.n_parallel }, + { "model_path", ctx_server.params_base.model }, { "chat_template", llama_get_chat_template(ctx_server.model) }, }; @@ -3417,17 +3354,47 @@ int main(int argc, char ** argv) { // handle completion-like requests (completion, chat, infill) // we can optionally provide a custom format for partial results and final results const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok]( - server_task_inf_type inf_type, + server_task_type type, json & data, httplib::Response & res, - bool oai_compat = false) { + bool oaicompat = false, + bool oaicompat_chat = false) { + GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL); + if (ctx_server.params_base.embedding) { res_error(res, format_error_response("This 
server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; } - data["completion_id"] = gen_chatcmplid(); - std::vector tasks = ctx_server.create_tasks_inference(data, inf_type); + auto completion_id = gen_chatcmplid(); + std::vector tasks; + + try { + std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, data.at("prompt"), true, true); + tasks.reserve(tokenized_prompts.size()); + for (size_t i = 0; i < tokenized_prompts.size(); i++) { + server_task task = server_task(type); + + task.id = ctx_server.queue_tasks.get_new_id(); + task.index = i; + + task.prompt_tokens = std::move(tokenized_prompts[i]); + task.params = server_task::params_from_json_cmpl(ctx_server.model, ctx_server.params_base, data); + task.id_selected_slot = json_value(data, "id_slot", -1); + + // OAI-compat + task.params.oaicompat = oaicompat; + task.params.oaicompat_chat = oaicompat_chat; + task.params.oaicompat_cmpl_id = completion_id; + // oaicompat_model is already populated by params_from_json_cmpl + + tasks.push_back(task); + } + } catch (const std::exception & e) { + res_error(res, format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST)); + return; + } + ctx_server.queue_results.add_waiting_tasks(tasks); ctx_server.queue_tasks.post(tasks); @@ -3453,7 +3420,7 @@ int main(int argc, char ** argv) { ctx_server.queue_results.remove_waiting_task_ids(task_ids); } else { - const auto chunked_content_provider = [task_ids, &ctx_server, oai_compat](size_t, httplib::DataSink & sink) { + const auto chunked_content_provider = [task_ids, &ctx_server, oaicompat](size_t, httplib::DataSink & sink) { ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_ptr & result) -> bool { json res_json = result->to_json(); if (res_json.is_array()) { @@ -3469,7 +3436,7 @@ int main(int argc, char ** argv) { }, [&](const json & error_data) { server_sent_event(sink, "error", error_data); }); - if (oai_compat) { + if (oaicompat) { static const std::string ev_done = "data: [DONE]\n\n"; sink.write(ev_done.data(), ev_done.size()); } @@ -3487,7 +3454,12 @@ int main(int argc, char ** argv) { const auto handle_completions = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) { json data = json::parse(req.body); - return handle_completions_generic(SERVER_TASK_INF_TYPE_COMPLETION, data, res); + return handle_completions_generic( + SERVER_TASK_TYPE_COMPLETION, + data, + res, + /* oaicompat */ false, + /* oaicompat_chat */ false); }; const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { @@ -3537,7 +3509,7 @@ int main(int argc, char ** argv) { } data["input_extra"] = input_extra; // default to empty array if it's not exist - return handle_completions_generic(SERVER_TASK_INF_TYPE_INFILL, data, res); + return handle_completions_generic(SERVER_TASK_TYPE_INFILL, data, res); }; const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { @@ -3547,11 +3519,15 @@ int main(int argc, char ** argv) { } json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); - data["__oaicompat_chat"] = true; - return handle_completions_generic(SERVER_TASK_INF_TYPE_COMPLETION, data, res, true); + return handle_completions_generic( + SERVER_TASK_TYPE_COMPLETION, + data, + res, + /* oaicompat */ true, + /* oaicompat_chat */ true); }; - const auto 
handle_models = [¶ms, &ctx_server](const httplib::Request &, httplib::Response & res) { + const auto handle_models = [¶ms, &ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) { json models = { {"object", "list"}, {"data", { @@ -3565,7 +3541,7 @@ int main(int argc, char ** argv) { }} }; - res.set_content(models.dump(), MIMETYPE_JSON); + res_ok(res, models); }; const auto handle_tokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) { @@ -3642,7 +3618,16 @@ int main(int argc, char ** argv) { json responses = json::array(); bool error = false; { - std::vector tasks = ctx_server.create_tasks_inference({{"prompt", prompt}}, SERVER_TASK_INF_TYPE_EMBEDDING); + std::vector tasks; + std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, /* add_special */ false, true); + for (size_t i = 0; i < tokenized_prompts.size(); i++) { + server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING); + task.id = ctx_server.queue_tasks.get_new_id(); + task.index = i; + task.prompt_tokens = std::move(tokenized_prompts[i]); + tasks.push_back(task); + } + ctx_server.queue_results.add_waiting_tasks(tasks); ctx_server.queue_tasks.post(tasks); @@ -3669,7 +3654,7 @@ int main(int argc, char ** argv) { // write JSON response json root = oaicompat ? format_embeddings_response_oaicompat(body, responses) - : responses[0]; + : responses.size() == 1 ? responses[0] : json(responses); res_ok(res, root); }; @@ -3708,20 +3693,23 @@ int main(int argc, char ** argv) { return; } - // construct prompt object: array of ["query", "doc0", "doc1", ...] - json prompt; - prompt.push_back(query); - for (const auto & doc : documents) { - prompt.push_back(doc); - } - - LOG_DBG("rerank prompt: %s\n", prompt.dump().c_str()); + llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.ctx, query, /* add_special */ false, true)[0]; // create and queue the task json responses = json::array(); bool error = false; { - std::vector tasks = ctx_server.create_tasks_inference({{"prompt", prompt}}, SERVER_TASK_INF_TYPE_RERANK); + std::vector tasks; + std::vector tokenized_docs = tokenize_input_prompts(ctx_server.ctx, documents, /* add_special */ false, true); + tasks.reserve(tokenized_docs.size()); + for (size_t i = 0; i < tokenized_docs.size(); i++) { + server_task task = server_task(SERVER_TASK_TYPE_RERANK); + task.id = ctx_server.queue_tasks.get_new_id(); + task.index = i; + task.prompt_tokens = format_rerank(ctx_server.model, tokenized_query, tokenized_docs[i]); + tasks.push_back(task); + } + ctx_server.queue_results.add_waiting_tasks(tasks); ctx_server.queue_tasks.post(tasks); @@ -3782,13 +3770,13 @@ int main(int argc, char ** argv) { } } - server_task task; - task.type = SERVER_TASK_TYPE_SET_LORA; - const int id_task = ctx_server.queue_tasks.post(task); - ctx_server.queue_results.add_waiting_task_id(id_task); + server_task task(SERVER_TASK_TYPE_SET_LORA); + task.id = ctx_server.queue_tasks.get_new_id(); + ctx_server.queue_results.add_waiting_task_id(task.id); + ctx_server.queue_tasks.post(task); - server_task_result_ptr result = ctx_server.queue_results.recv(id_task); - ctx_server.queue_results.remove_waiting_task_id(id_task); + server_task_result_ptr result = ctx_server.queue_results.recv(task.id); + ctx_server.queue_results.remove_waiting_task_id(task.id); if (result->is_error()) { res_error(res, result->to_json()); diff --git a/examples/server/tests/unit/test_basic.py b/examples/server/tests/unit/test_basic.py index d82d54a5a..1d5124016 100644 --- 
a/examples/server/tests/unit/test_basic.py +++ b/examples/server/tests/unit/test_basic.py @@ -22,7 +22,12 @@ def test_server_props(): server.start() res = server.make_request("GET", "/props") assert res.status_code == 200 + assert ".gguf" in res.body["model_path"] assert res.body["total_slots"] == server.n_slots + default_val = res.body["default_generation_settings"] + assert server.n_ctx is not None and server.n_slots is not None + assert default_val["n_ctx"] == server.n_ctx / server.n_slots + assert default_val["params"]["seed"] == server.seed def test_server_models(): @@ -33,6 +38,31 @@ def test_server_models(): assert len(res.body["data"]) == 1 assert res.body["data"][0]["id"] == server.model_alias + +def test_server_slots(): + global server + + # without slots endpoint enabled, this should return error + server.server_slots = False + server.start() + res = server.make_request("GET", "/slots") + assert res.status_code == 501 # ERROR_TYPE_NOT_SUPPORTED + assert "error" in res.body + server.stop() + + # with slots endpoint enabled, this should return slots info + server.server_slots = True + server.n_slots = 2 + server.start() + res = server.make_request("GET", "/slots") + assert res.status_code == 200 + assert len(res.body) == server.n_slots + assert server.n_ctx is not None and server.n_slots is not None + assert res.body[0]["n_ctx"] == server.n_ctx / server.n_slots + assert "params" in res.body[0] + assert res.body[0]["params"]["seed"] == server.seed + + def test_load_split_model(): global server server.model_hf_repo = "ggml-org/models" diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py index f13c6c4ca..6573cc17f 100644 --- a/examples/server/tests/unit/test_chat_completion.py +++ b/examples/server/tests/unit/test_chat_completion.py @@ -30,6 +30,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte ], }) assert res.status_code == 200 + assert "cmpl" in res.body["id"] # make sure the completion id has the expected format assert res.body["model"] == model if model is not None else server.model_alias assert res.body["usage"]["prompt_tokens"] == n_prompt assert res.body["usage"]["completion_tokens"] == n_predicted @@ -59,9 +60,13 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte "stream": True, }) content = "" + last_cmpl_id = None for data in res: choice = data["choices"][0] assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future + if last_cmpl_id is None: + last_cmpl_id = data["id"] + assert last_cmpl_id == data["id"] # make sure the completion id is the same for all events in the stream if choice["finish_reason"] in ["stop", "length"]: assert data["usage"]["prompt_tokens"] == n_prompt assert data["usage"]["completion_tokens"] == n_predicted diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py index e17a05ff6..69215eaa4 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -64,6 +64,7 @@ class ServerProcess: server_embeddings: bool | None = False server_reranking: bool | None = False server_metrics: bool | None = False + server_slots: bool | None = False draft: int | None = None api_key: str | None = None response_format: str | None = None @@ -91,7 +92,6 @@ class ServerProcess: else: server_path = "../../../build/bin/llama-server" server_args = [ - "--slots", # requires to get slot status via /slots endpoint "--host", self.server_host, "--port", @@ -129,6 +129,8 @@ class 
ServerProcess: server_args.append("--reranking") if self.server_metrics: server_args.append("--metrics") + if self.server_slots: + server_args.append("--slots") if self.model_alias: server_args.extend(["--alias", self.model_alias]) if self.n_ctx: @@ -181,7 +183,7 @@ class ServerProcess: start_time = time.time() while time.time() - start_time < timeout_seconds: try: - response = self.make_request("GET", "/slots", headers={ + response = self.make_request("GET", "/health", headers={ "Authorization": f"Bearer {self.api_key}" if self.api_key else None }) if response.status_code == 200: @@ -224,7 +226,7 @@ class ServerProcess: result.headers = dict(response.headers) result.status_code = response.status_code result.body = response.json() if parse_body else None - print("Response from server", result.body) + print("Response from server", json.dumps(result.body, indent=2)) return result def make_stream_request( @@ -245,7 +247,7 @@ class ServerProcess: break elif line.startswith('data: '): data = json.loads(line[6:]) - print("Partial response from server", data) + print("Partial response from server", json.dumps(data, indent=2)) yield data diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index c9fe7d966..8f545aea5 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -164,6 +164,9 @@ static std::vector tokenize_input_prompts(llama_context * ctx, con } else { throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts"); } + if (result.empty()) { + throw std::runtime_error("\"prompt\" must not be empty"); + } return result; } @@ -496,8 +499,6 @@ static json oaicompat_completion_params_parse( const std::string & chat_template) { json llama_params; - llama_params["__oaicompat"] = true; - // Apply chat template to the list of messages llama_params["prompt"] = format_chat(model, chat_template, body.at("messages")); @@ -648,3 +649,18 @@ static json format_detokenized_response(const std::string & content) { {"content", content} }; } + +static json format_logit_bias(const std::vector & logit_bias) { + json data = json::array(); + for (const auto & lb : logit_bias) { + data.push_back(json{ + {"bias", lb.bias}, + {"token", lb.token}, + }); + } + return data; +} + +static std::string safe_json_to_str(json data) { + return data.dump(-1, ' ', false, json::error_handler_t::replace); +} From 62e84d984875372f4b0fb89a67658e012ff0cc9f Mon Sep 17 00:00:00 2001 From: Robert Collins Date: Sat, 7 Dec 2024 16:12:27 -0500 Subject: [PATCH 26/99] llama : add 128k yarn context for Qwen (#10698) * add 128k yarn context for Qwen * added property for model tensors * removing useless line --- convert_hf_to_gguf.py | 8 ++++++++ gguf-py/gguf/constants.py | 1 + 2 files changed, 9 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a4eece934..c63d929c1 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1992,6 +1992,14 @@ class Qwen2Model(Model): except FileNotFoundError: self._set_vocab_gpt2() + def set_gguf_parameters(self): + super().set_gguf_parameters() + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "yarn": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) + 
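# Illustration only (values assumed, not taken from the patch): the yarn branch added
# above expects the HF config's rope_scaling block to look roughly like
#
#   "rope_scaling": {
#       "type": "yarn",
#       "factor": 4.0,
#       "original_max_position_embeddings": 32768
#   }
#
# and maps it onto gguf.RopeScalingType.YARN plus the rope_scaling_factor and
# rope_scaling_orig_ctx_len keys written by set_gguf_parameters().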
@Model.register("Qwen2MoeForCausalLM") class Qwen2MoeModel(Model): diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 66247b803..4c8710b39 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -761,6 +761,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, From ecc93d0558fc3ecb8a5af69d2ece02fae4710ade Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sun, 8 Dec 2024 02:05:55 -0600 Subject: [PATCH 27/99] vulkan: compile a test shader in cmake to check for coopmat2 support (#10713) --- ggml/src/ggml-vulkan/CMakeLists.txt | 14 ++++++++++++++ ggml/src/ggml-vulkan/ggml-vulkan.cpp | 19 +++++++++++++------ .../vulkan-shaders/test_coopmat2_support.comp | 7 +++++++ .../vulkan-shaders/vulkan-shaders-gen.cpp | 4 ++-- 4 files changed, 36 insertions(+), 8 deletions(-) create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index ae0485e04..546c853b6 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -8,6 +8,20 @@ if (Vulkan_FOUND) ../../include/ggml-vulkan.h ) + # Compile a test shader to determine whether GL_NV_cooperative_matrix2 is supported. + # If it's not, there will be an error to stderr. + # If it's supported, set a define to indicate that we should compile those shaders + execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat2_support.comp" + OUTPUT_VARIABLE glslc_output + ERROR_VARIABLE glslc_error) + + if (${glslc_error} MATCHES ".*extension not supported: GL_NV_cooperative_matrix2.*") + message(STATUS "GL_NV_cooperative_matrix2 not supported by glslc") + else() + message(STATUS "GL_NV_cooperative_matrix2 supported by glslc") + add_compile_definitions(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) + endif() + target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan) target_include_directories(ggml-vulkan PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 4b2e449a5..9e2de9439 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1513,7 +1513,7 @@ static void ggml_vk_load_shaders(vk_device& device) { compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness)); }; -#if defined(VK_NV_cooperative_matrix2) +#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) if (device->coopmat2) { auto const &fa_wg_denoms = [&](uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::array { @@ -1611,7 +1611,7 @@ static void ggml_vk_load_shaders(vk_device& device) { #undef CREATE_MM #undef CREATE_MM2 } else -#endif // defined(VK_NV_cooperative_matrix2) +#endif // defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) if (device->coopmat_support) { // Create 6 variants, {s,m,l}x{unaligned,aligned} #define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ @@ -2153,7 +2153,7 @@ static vk_device ggml_vk_get_device(size_t idx) { device->coopmat_support = 
device->coopmat_support && coopmat_features.cooperativeMatrix; if (coopmat2_support) { -#if defined(VK_NV_cooperative_matrix2) +#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) if (coopmat2_features.cooperativeMatrixWorkgroupScope && coopmat2_features.cooperativeMatrixFlexibleDimensions && coopmat2_features.cooperativeMatrixReductions && @@ -2414,14 +2414,19 @@ static void ggml_vk_print_gpu_info(size_t idx) { bool fp16_storage = false; bool fp16_compute = false; bool coopmat_support = false; + bool coopmat2_support = false; for (auto properties : ext_props) { if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { fp16_storage = true; } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { fp16_compute = true; - } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0) { + } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 && + !getenv("GGML_VK_DISABLE_COOPMAT")) { coopmat_support = true; + } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 && + !getenv("GGML_VK_DISABLE_COOPMAT2")) { + coopmat2_support = true; } } @@ -2472,9 +2477,11 @@ static void ggml_vk_print_gpu_info(size_t idx) { coopmat_support = coopmat_support && coopmat_features.cooperativeMatrix; + std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? "KHR_coopmat" : "none"; + std::string device_name = props2.properties.deviceName.data(); - GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu | matrix cores: %d\n", - idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size, coopmat_support); + GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu | matrix cores: %s\n", + idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size, matrix_cores.c_str()); if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) { GGML_LOG_DEBUG("ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want.\n"); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp b/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp new file mode 100644 index 000000000..28eb24e11 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/test_coopmat2_support.comp @@ -0,0 +1,7 @@ +#version 460 + +#extension GL_NV_cooperative_matrix2 : require + +void main() +{ +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index c4967ed65..bc6fca506 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -342,14 +342,14 @@ void process_shaders() { matmul_shaders(true, matmul_id, true, false, false); matmul_shaders(true, matmul_id, true, false, true); -#if defined(VK_NV_cooperative_matrix2) +#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) // Coopmat2, fp32acc and fp16acc matmul_shaders(true, matmul_id, false, true, false); matmul_shaders(true, matmul_id, false, true, true); #endif } -#if defined(VK_NV_cooperative_matrix2) +#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) // flash attention for (const auto& f16acc : {false, true}) { std::string acctype = f16acc ? 
"float16_t" : "float"; From 43ed389a3f102517e6f7d5620d8e451e88afbf27 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Sun, 8 Dec 2024 12:14:54 +0100 Subject: [PATCH 28/99] llama : use cmake for swift build (#10525) * llama : use cmake for swift build * swift : <> -> "" * ci : remove make * ci : disable ios build * Revert "swift : <> -> """ This reverts commit d39ffd9556482b77d4ea5b118b453fc1c097a31d. * ci : try fix ios build * ci : cont * ci : cont --------- Co-authored-by: Georgi Gerganov --- .github/workflows/build.yml | 107 ++++++++++-------- Package.swift | 78 +------------ Sources/llama/llama.h | 4 + Sources/llama/module.modulemap | 5 + cmake/llama.pc.in | 2 +- .../llama.cpp.swift/LibLlama.swift | 8 +- .../llama.swiftui.xcodeproj/project.pbxproj | 8 +- 7 files changed, 81 insertions(+), 131 deletions(-) create mode 100644 Sources/llama/llama.h create mode 100644 Sources/llama/module.modulemap diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f3326a5fb..886d33d2d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -552,35 +552,44 @@ jobs: -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO -# TODO: tmp disabled. see for possible re-enable: -# https://github.com/ggerganov/llama.cpp/pull/10525 -# macOS-latest-swift: -# runs-on: macos-latest -# -# strategy: -# matrix: -# destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS'] -# -# steps: -# - name: Clone -# id: checkout -# uses: actions/checkout@v4 -# -# - name: Dependencies -# id: depends -# continue-on-error: true -# run: | -# brew update -# -# - name: xcodebuild for swift package -# id: xcodebuild -# run: | -# xcodebuild -scheme llama -destination "${{ matrix.destination }}" -# -# - name: Build Swift Example -# id: make_build_swift_example -# run: | -# make swift + macOS-latest-swift: + runs-on: macos-latest + + strategy: + matrix: + destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS'] + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: Dependencies + id: depends + continue-on-error: true + run: | + brew update + + - name: Build llama.cpp with CMake + id: cmake_build + run: | + sysctl -a + mkdir build + cd build + cmake -G Xcode .. \ + -DGGML_METAL_USE_BF16=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_SERVER=OFF \ + -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" + cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) + sudo cmake --install . --config Release + + - name: xcodebuild for swift package + id: xcodebuild + run: | + xcodebuild -scheme llama-Package -destination "${{ matrix.destination }}" windows-msys2: runs-on: windows-latest @@ -1104,6 +1113,29 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + - name: Build + id: cmake_build + run: | + sysctl -a + mkdir build + cd build + cmake -G Xcode .. \ + -DGGML_METAL_USE_BF16=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_SERVER=OFF \ + -DCMAKE_SYSTEM_NAME=iOS \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ + -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml + cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO + sudo cmake --install . 
--config Release + + - name: xcodebuild for swift package + id: xcodebuild + run: | + xcodebuild -scheme llama-Package -destination 'generic/platform=iOS' + - name: Build Xcode project run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build @@ -1131,23 +1163,6 @@ jobs: ./gradlew build --no-daemon -# freeBSD-latest: -# runs-on: macos-12 -# steps: -# - name: Clone -# uses: actions/checkout@v4 -# -# - name: Build -# uses: cross-platform-actions/action@v0.19.0 -# with: -# operating_system: freebsd -# version: '13.2' -# hypervisor: 'qemu' -# run: | -# sudo pkg update -# sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas -# gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu` - release: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} diff --git a/Package.swift b/Package.swift index 3afeb2f19..01c996d24 100644 --- a/Package.swift +++ b/Package.swift @@ -2,60 +2,6 @@ import PackageDescription -var sources = [ - "src/llama.cpp", - "src/llama-vocab.cpp", - "src/llama-grammar.cpp", - "src/llama-sampling.cpp", - "src/unicode.cpp", - "src/unicode-data.cpp", - "ggml/src/ggml.c", - "ggml/src/ggml-alloc.c", - "ggml/src/ggml-backend.cpp", - "ggml/src/ggml-backend-reg.cpp", - "ggml/src/ggml-cpu/ggml-cpu.c", - "ggml/src/ggml-cpu/ggml-cpu.cpp", - "ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp", - "ggml/src/ggml-cpu/ggml-cpu-hbm.cpp", - "ggml/src/ggml-cpu/ggml-cpu-quants.c", - "ggml/src/ggml-cpu/ggml-cpu-traits.cpp", - "ggml/src/ggml-threading.cpp", - "ggml/src/ggml-quants.c", -] - -var resources: [Resource] = [] -var linkerSettings: [LinkerSetting] = [] -var cSettings: [CSetting] = [ - .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), - .unsafeFlags(["-fno-objc-arc"]), - .headerSearchPath("ggml/src"), - .headerSearchPath("ggml/src/ggml-cpu"), - // NOTE: NEW_LAPACK will required iOS version 16.4+ - // We should consider add this in the future when we drop support for iOS 14 - // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc) - // .define("ACCELERATE_NEW_LAPACK"), - // .define("ACCELERATE_LAPACK_ILP64") - .define("GGML_USE_CPU"), -] - - -#if canImport(Darwin) -sources.append("ggml/src/ggml-common.h") -sources.append("ggml/src/ggml-metal/ggml-metal.m") -resources.append(.process("ggml/src/ggml-metal/ggml-metal.metal")) -linkerSettings.append(.linkedFramework("Accelerate")) -cSettings.append( - contentsOf: [ - .define("GGML_USE_ACCELERATE"), - .define("GGML_USE_METAL"), - ] -) -#endif - -#if os(Linux) - cSettings.append(.define("_GNU_SOURCE")) -#endif - let package = Package( name: "llama", platforms: [ @@ -68,26 +14,6 @@ let package = Package( .library(name: "llama", targets: ["llama"]), ], targets: [ - .target( - name: "llama", - path: ".", - exclude: [ - "build", - "cmake", - "examples", - "scripts", - "models", - "tests", - "CMakeLists.txt", - "Makefile", - "ggml/src/ggml-metal-embed.metal" - ], - sources: sources, - resources: resources, - publicHeadersPath: "spm-headers", - cSettings: cSettings, - linkerSettings: linkerSettings - ) - ], - cxxLanguageStandard: .cxx17 + .systemLibrary(name: "llama", pkgConfig: "llama"), + ] ) diff --git a/Sources/llama/llama.h b/Sources/llama/llama.h new file mode 100644 index 000000000..41725880e --- /dev/null +++ b/Sources/llama/llama.h @@ -0,0 +1,4 @@ 
+#pragma once + +#include + diff --git a/Sources/llama/module.modulemap b/Sources/llama/module.modulemap new file mode 100644 index 000000000..d010555b1 --- /dev/null +++ b/Sources/llama/module.modulemap @@ -0,0 +1,5 @@ +module llama [system] { + header "llama.h" + link "llama" + export * +} diff --git a/cmake/llama.pc.in b/cmake/llama.pc.in index 326acbb61..0b2b6bcfa 100644 --- a/cmake/llama.pc.in +++ b/cmake/llama.pc.in @@ -6,5 +6,5 @@ includedir=${prefix}/include Name: llama Description: Port of Facebook's LLaMA model in C/C++ Version: @PROJECT_VERSION@ -Libs: -L${libdir} -lllama +Libs: -L${libdir} -lggml -lggml-base -lllama Cflags: -I${includedir} diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 65cd4eb51..998c673d5 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -210,20 +210,20 @@ actor LlamaContext { llama_kv_cache_clear(context) - let t_pp_start = ggml_time_us() + let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000; if llama_decode(context, batch) != 0 { print("llama_decode() failed during prompt") } llama_synchronize(context) - let t_pp_end = ggml_time_us() + let t_pp_end = DispatchTime.now().uptimeNanoseconds / 1000; // bench text generation llama_kv_cache_clear(context) - let t_tg_start = ggml_time_us() + let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000; for i in 0.. Date: Sun, 8 Dec 2024 19:19:19 +0100 Subject: [PATCH 29/99] Vulkan: fix NaN in tanh.comp with AMD proprietary driver on Windows (#10723) * Vulkan: fix NaN in tanh.comp * Faster NaN-free tanh --- ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp b/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp index 74630dc7f..495f966bd 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp @@ -16,6 +16,5 @@ void main() { if (i >= p.KX) { return; } - - data_d[i] = D_TYPE(tanh(data_a[i])); + data_d[i] = D_TYPE(1. - 2. / (exp(2.*data_a[i]) + 1.)); } From e52522b8694ae73abf12feb18d29168674aa1c1b Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 8 Dec 2024 20:38:51 +0100 Subject: [PATCH 30/99] server : bring back info of final chunk in stream mode (#10722) * server : bring back into to final chunk in stream mode * clarify a bit * traling space --- examples/server/server.cpp | 174 +++++++++--------- examples/server/tests/unit/test_completion.py | 6 + 2 files changed, 94 insertions(+), 86 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1c21e55aa..1d9c0533d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -392,7 +392,7 @@ struct server_task_result { return false; } virtual bool is_stop() { - // only used by server_task_result_cmpl_partial + // only used by server_task_result_cmpl_* return false; } virtual int get_index() { @@ -478,14 +478,20 @@ struct server_task_result_cmpl_final : server_task_result { return index; } + virtual bool is_stop() override { + return true; // in stream mode, final responses are considered stop + } + virtual json to_json() override { - return oaicompat ? to_json_oaicompat_chat() : to_json_non_oaicompat(); + return oaicompat + ? (stream ? 
to_json_oaicompat_chat_stream() : to_json_oaicompat_chat()) + : to_json_non_oaicompat(); } json to_json_non_oaicompat() { json res = json { {"index", index}, - {"content", content}, + {"content", stream ? "" : content}, // in stream mode, content is already in last partial chunk {"id_slot", id_slot}, {"stop", true}, {"model", oaicompat_model}, @@ -546,18 +552,46 @@ struct server_task_result_cmpl_final : server_task_result { return res; } + + json to_json_oaicompat_chat_stream() { + std::time_t t = std::time(0); + std::string finish_reason = "length"; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + finish_reason = "stop"; + } + + json choices = json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}}}); + + json ret = json { + {"choices", choices}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"}, + {"usage", json { + {"completion_tokens", n_decoded}, + {"prompt_tokens", n_prompt_tokens}, + {"total_tokens", n_decoded + n_prompt_tokens}, + }}, + }; + + if (timings.prompt_n >= 0) { + ret.push_back({"timings", timings.to_json()}); + } + + return ret; + } }; struct server_task_result_cmpl_partial : server_task_result { int index = 0; std::string content; - bool truncated; int32_t n_decoded; int32_t n_prompt_tokens; - stop_type stop = STOP_TYPE_NONE; - std::vector probs_output; result_timings timings; @@ -573,20 +607,19 @@ struct server_task_result_cmpl_partial : server_task_result { } virtual bool is_stop() override { - return stop != STOP_TYPE_NONE; + return false; // in stream mode, partial responses are not considered stop } virtual json to_json() override { - if (oaicompat) { - return to_json_oaicompat(); - } - bool is_stop = stop != STOP_TYPE_NONE; + return oaicompat ? 
to_json_oaicompat() : to_json_non_oaicompat(); + } + + json to_json_non_oaicompat() { // non-OAI-compat JSON json res = json { {"index", index}, {"content", content}, - {"stop_type", stop_type_to_str(stop)}, - {"stop", is_stop}, + {"stop", false}, {"id_slot", id_slot}, {"tokens_predicted", n_decoded}, {"tokens_evaluated", n_prompt_tokens}, @@ -598,72 +631,54 @@ struct server_task_result_cmpl_partial : server_task_result { if (!probs_output.empty()) { res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output); } - if (is_stop) { - res.push_back({"truncated", truncated}); - } return res; } json to_json_oaicompat() { bool first = n_decoded == 0; - - std::string finish_reason; - if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { - finish_reason = "stop"; - } else if (stop == STOP_TYPE_LIMIT) { - finish_reason = "length"; - } - std::time_t t = std::time(0); - json choices; - if (!finish_reason.empty()) { - choices = json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}); - } else { - if (first) { - if (content.empty()) { - choices = json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{{"role", "assistant"}}}}}); - } else { - // We have to send this as two updates to conform to openai behavior - json initial_ret = json{{"choices", json::array({json{ - {"finish_reason", nullptr}, + if (first) { + if (content.empty()) { + choices = json::array({json{{"finish_reason", nullptr}, {"index", 0}, - {"delta", json{ - {"role", "assistant"} - }}}})}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"object", "chat.completion.chunk"}}; - - json second_ret = json{ - {"choices", json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"content", content}}} - }})}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"object", "chat.completion.chunk"}}; - - return std::vector({initial_ret, second_ret}); - } + {"delta", json{{"role", "assistant"}}}}}); } else { - choices = json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", - json{ - {"content", content}, - }}, - }}); + // We have to send this as two updates to conform to openai behavior + json initial_ret = json{{"choices", json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"role", "assistant"} + }}}})}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"}}; + + json second_ret = json{ + {"choices", json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"content", content}}} + }})}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"}}; + + return std::vector({initial_ret, second_ret}); } + } else { + choices = json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", + json{ + {"content", content}, + }}, + }}); } json ret = json { @@ -678,14 +693,6 @@ struct server_task_result_cmpl_partial : server_task_result { ret.push_back({"timings", timings.to_json()}); } - if (!finish_reason.empty()) { - ret.push_back({"usage", json { - {"completion_tokens", n_decoded}, - {"prompt_tokens", n_prompt_tokens}, - {"total_tokens", n_decoded + n_prompt_tokens}, - }}); - } - return std::vector({ret}); } }; @@ -1888,12 +1895,9 @@ struct server_context { res->index = slot.index; res->content = tkn.text_to_send; - res->truncated = slot.truncated; res->n_decoded = 
slot.n_decoded; res->n_prompt_tokens = slot.n_prompt_tokens; - res->stop = slot.stop; - res->verbose = slot.params.verbose; res->oaicompat = slot.params.oaicompat; res->oaicompat_chat = slot.params.oaicompat_chat; @@ -1924,12 +1928,6 @@ struct server_context { } void send_final_response(server_slot & slot) { - if (slot.params.stream) { - // if in stream mode, send the last partial response - send_partial_response(slot, {0, "", {}}); - return; - } - auto res = std::make_unique(); res->id = slot.id_task; res->id_slot = slot.id; @@ -1948,6 +1946,7 @@ struct server_context { res->stop = slot.stop; res->verbose = slot.params.verbose; + res->stream = slot.params.stream; res->oaicompat = slot.params.oaicompat; res->oaicompat_chat = slot.params.oaicompat_chat; res->oaicompat_model = slot.params.oaicompat_model; @@ -2100,7 +2099,10 @@ struct server_context { return; } - GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + GGML_ASSERT( + dynamic_cast(result.get()) != nullptr + || dynamic_cast(result.get()) != nullptr + ); if (!result_handler(result)) { cancel_tasks(id_tasks); break; diff --git a/examples/server/tests/unit/test_completion.py b/examples/server/tests/unit/test_completion.py index 1c3aa77de..7f4f9cd03 100644 --- a/examples/server/tests/unit/test_completion.py +++ b/examples/server/tests/unit/test_completion.py @@ -42,10 +42,16 @@ def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_promp }) content = "" for data in res: + assert "stop" in data and type(data["stop"]) == bool if data["stop"]: assert data["timings"]["prompt_n"] == n_prompt assert data["timings"]["predicted_n"] == n_predicted assert data["truncated"] == truncated + assert data["stop_type"] == "limit" + assert "generation_settings" in data + assert server.n_predict is not None + assert data["generation_settings"]["n_predict"] == min(n_predict, server.n_predict) + assert data["generation_settings"]["seed"] == server.seed assert match_regex(re_content, content) else: content += data["content"] From ce8784bdb153ff7794dde5a50b0ebfa51baa6171 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 8 Dec 2024 23:04:29 +0100 Subject: [PATCH 31/99] server : fix format_infill (#10724) * server : fix format_infill * fix * rename * update test * use another model * update test * update test * test_invalid_input_extra_req --- examples/server/server.cpp | 22 ++++++++++++++ examples/server/tests/unit/test_infill.py | 36 ++++++++++++++++++----- examples/server/tests/utils.py | 3 ++ 3 files changed, 53 insertions(+), 8 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1d9c0533d..47bfd6c4a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3484,6 +3484,11 @@ int main(int argc, char ** argv) { json data = json::parse(req.body); // validate input + if (data.contains("prompt") && !data.at("prompt").is_string()) { + // prompt is optional + res_error(res, format_error_response("\"prompt\" must be a string", ERROR_TYPE_INVALID_REQUEST)); + } + if (!data.contains("input_prefix")) { res_error(res, format_error_response("\"input_prefix\" is required", ERROR_TYPE_INVALID_REQUEST)); } @@ -3493,9 +3498,11 @@ int main(int argc, char ** argv) { } if (data.contains("input_extra") && !data.at("input_extra").is_array()) { + // input_extra is optional res_error(res, format_error_response("\"input_extra\" must be an array of {\"filename\": string, \"text\": string}", ERROR_TYPE_INVALID_REQUEST)); return; } + json input_extra = json_value(data, "input_extra", json::array()); 
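// Illustrative request shape (assembled from the infill tests, values are examples only):
// a typical /infill body that reaches this point looks roughly like
//
//   {
//     "input_prefix": "int main() {\n",
//     "input_suffix": "}\n",
//     "prompt":       " int n_threads = llama_",
//     "input_extra":  [ { "filename": "llama.h", "text": "LLAMA_API int32_t llama_n_threads();\n" } ]
//   }
//
// The loop below only sanity-checks each input_extra chunk; the final prompt is
// assembled afterwards by format_infill() from prefix, suffix, prompt and extra chunks.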
for (const auto & chunk : input_extra) { // { "text": string, "filename": string } @@ -3511,6 +3518,21 @@ int main(int argc, char ** argv) { } data["input_extra"] = input_extra; // default to empty array if it's not exist + std::string prompt = json_value(data, "prompt", std::string()); + std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true); + SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size()); + data["prompt"] = format_infill( + ctx_server.ctx, + data.at("input_prefix"), + data.at("input_suffix"), + data.at("input_extra"), + ctx_server.params_base.n_batch, + ctx_server.params_base.n_predict, + ctx_server.slots[0].n_ctx, // TODO: there should be a better way + ctx_server.params_base.spm_infill, + tokenized_prompts[0] + ); + return handle_completions_generic(SERVER_TASK_TYPE_INFILL, data, res); }; diff --git a/examples/server/tests/unit/test_infill.py b/examples/server/tests/unit/test_infill.py index 6a6d40a1c..ad4b8192a 100644 --- a/examples/server/tests/unit/test_infill.py +++ b/examples/server/tests/unit/test_infill.py @@ -13,28 +13,28 @@ def test_infill_without_input_extra(): global server server.start() res = server.make_request("POST", "/infill", data={ - "prompt": "Complete this", - "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_", + "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n", + "prompt": " int n_threads = llama_", "input_suffix": "}\n", }) assert res.status_code == 200 - assert match_regex("(One|day|she|saw|big|scary|bird)+", res.body["content"]) + assert match_regex("(Ann|small|shiny)+", res.body["content"]) def test_infill_with_input_extra(): global server server.start() res = server.make_request("POST", "/infill", data={ - "prompt": "Complete this", "input_extra": [{ "filename": "llama.h", "text": "LLAMA_API int32_t llama_n_threads();\n" }], - "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_", + "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n", + "prompt": " int n_threads = llama_", "input_suffix": "}\n", }) assert res.status_code == 200 - assert match_regex("(cuts|Jimmy|mom|came|into|the|room)+", res.body["content"]) + assert match_regex("(Dad|excited|park)+", res.body["content"]) @pytest.mark.parametrize("input_extra", [ @@ -48,10 +48,30 @@ def test_invalid_input_extra_req(input_extra): global server server.start() res = server.make_request("POST", "/infill", data={ - "prompt": "Complete this", "input_extra": [input_extra], - "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n int n_threads = llama_", + "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n", + "prompt": " int n_threads = llama_", "input_suffix": "}\n", }) assert res.status_code == 400 assert "error" in res.body + + +@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test") +def test_with_qwen_model(): + global server + server.model_file = None + server.model_hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-IQ3_XXS-GGUF" + server.model_hf_file = "qwen2.5-coder-1.5b-iq3_xxs-imat.gguf" + server.start(timeout_seconds=600) + res = server.make_request("POST", "/infill", data={ + "input_extra": [{ + "filename": "llama.h", + "text": "LLAMA_API int32_t llama_n_threads();\n" + }], + "input_prefix": "#include \n#include \"llama.h\"\n\nint main() {\n", + "prompt": " int n_threads = llama_", + "input_suffix": "}\n", + }) + assert res.status_code == 200 + assert res.body["content"] == 
"n_threads();\n printf(\"Number of threads: %d\\n\", n_threads);\n return 0;\n" diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py index 69215eaa4..7c89b9cd3 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -371,3 +371,6 @@ def match_regex(regex: str, text: str) -> bool: ).search(text) is not None ) + +def is_slow_test_allowed(): + return os.environ.get("SLOW_TESTS") == "1" or os.environ.get("SLOW_TESTS") == "ON" From 1a05004743e00aca400833be625f0ec8cce176a7 Mon Sep 17 00:00:00 2001 From: Borislav Stanimirov Date: Mon, 9 Dec 2024 09:15:13 +0200 Subject: [PATCH 32/99] cmake : simplify msvc charsets (#10672) --- CMakeLists.txt | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f84fff9e6..a717a508f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,11 +46,9 @@ if (WIN32) add_compile_definitions(_CRT_SECURE_NO_WARNINGS) endif() -if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") - add_compile_options("$<$:/source-charset:utf-8>") - add_compile_options("$<$:/source-charset:utf-8>") - add_compile_options("$<$:/execution-charset:utf-8>") - add_compile_options("$<$:/execution-charset:utf-8>") +if (MSVC) + add_compile_options("$<$:/utf-8>") + add_compile_options("$<$:/utf-8>") endif() # From 3d98b4cb226c3140bd1ae6c65ed126b7d90332fa Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Mon, 9 Dec 2024 01:24:01 -0600 Subject: [PATCH 33/99] vulkan: fix compile warnings (#10731) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 9e2de9439..5d9eba983 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -427,7 +427,7 @@ static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_pus // and a shift: // // n/d = (mulhi(n, mp) + n) >> L; -void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L) +static void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L) { // compute L = ceil(log2(d)); L = 0; @@ -439,6 +439,7 @@ void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L) } template void init_pushconst_fastdiv(T &p) { + GGML_UNUSED(p); static_assert(!std::is_const::value, "unexpected type"); } @@ -3417,7 +3418,7 @@ static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int return split_k; } -static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned, ggml_type type_a) { +static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) { VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ")"); if (ctx->device->coopmat2) { @@ -3439,9 +3440,9 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, return aligned ? 
mmp->a_l : mmp->l; } -static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, ggml_type type_a) { +static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n) { VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")"); - return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true, type_a)->align; + return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true)->align; } static void ggml_vk_matmul( @@ -3571,6 +3572,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], 1 , (uint32_t)tensor->ne[0] , (uint32_t)(tensor->ne[0] * tensor->ne[1]) , (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]), 0, 0.0f, 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; init_pushconst_fastdiv(pc); ggml_vk_sync_buffers(subctx); @@ -3644,10 +3646,10 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub const int y_ne = ne11 * ne10; const int d_ne = ne11 * ne01; - const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11, src0->type)); + const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11)); const bool aligned = ne10 == kpad && ne01 > 8 && ne11 > 8; - vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned, src0->type); + vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned); const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline); @@ -5351,7 +5353,8 @@ static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, con (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, - op_params[0], 0.0f + op_params[0], 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, dryrun); } @@ -5365,6 +5368,7 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, dryrun); } @@ -5378,6 +5382,7 @@ static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, dryrun); } @@ -5391,6 +5396,7 @@ static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / 
dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, dryrun); } @@ -5405,6 +5411,7 @@ static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, con (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, op_params[0], op_params[1], + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, dryrun); } @@ -5418,6 +5425,7 @@ static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, dryrun); } @@ -5431,6 +5439,7 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, co (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, 0.0f, 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, dryrun); } @@ -5445,6 +5454,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, d_offset, 0.0f, 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, dryrun); } From c37fb4cf62ddf0d33562c4c4a4d6fb45e32ad3b6 Mon Sep 17 00:00:00 2001 From: Srihari-mcw <96763064+Srihari-mcw@users.noreply.github.com> Date: Mon, 9 Dec 2024 23:10:19 +0530 Subject: [PATCH 34/99] Changes to CMakePresets.json to add ninja clang target on windows (#10668) * Update cmakepreset.json to use clang with ninja by default * Update cmakepreset.json to add clang and ninja based configs * Updates to build.md file * Make updates to rename preset targets * Update with .cmake file * Remove additional whitespaces * Add .cmake file for x64-windows-llvm * Update docs/build.md * Update docs/build.md --------- Co-authored-by: Max Krasnyansky --- CMakePresets.json | 12 ++++++++++++ cmake/x64-windows-llvm.cmake | 11 +++++++++++ docs/build.md | 7 +++++++ 3 files changed, 30 insertions(+) create mode 100644 cmake/x64-windows-llvm.cmake diff --git a/CMakePresets.json b/CMakePresets.json index 436448967..13bdd7907 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -31,6 +31,13 @@ { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } }, { "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } }, + { + "name": "x64-windows-llvm", "hidden": true, + "cacheVariables": { + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/x64-windows-llvm.cmake" + } + }, + { "name": "arm64-windows-msvc", "hidden": true, "architecture": { "value": "arm64", "strategy": "external" }, @@ -70,6 +77,11 @@ { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] }, { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] }, + { "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", 
"debug" ] }, + { "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] }, + { "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] }, + { "name": "x64-windows-llvm+static-release", "inherits": [ "base", "x64-windows-llvm", "reldbg", "static" ] }, + { "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] }, { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] }, { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] }, diff --git a/cmake/x64-windows-llvm.cmake b/cmake/x64-windows-llvm.cmake new file mode 100644 index 000000000..0603d738f --- /dev/null +++ b/cmake/x64-windows-llvm.cmake @@ -0,0 +1,11 @@ +set( CMAKE_SYSTEM_NAME Windows ) +set( CMAKE_SYSTEM_PROCESSOR x86_64 ) + +set( CMAKE_C_COMPILER clang ) +set( CMAKE_CXX_COMPILER clang++ ) + +set( arch_c_flags "-march=native" ) + +set( CMAKE_C_FLAGS_INIT "${arch_c_flags}" ) +set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" ) + diff --git a/docs/build.md b/docs/build.md index 26e673788..84019b204 100644 --- a/docs/build.md +++ b/docs/build.md @@ -57,6 +57,13 @@ cmake --build build --config Release ``` Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_N_M CPU kernels. + For building with ninja generator and clang compiler as default: + -set path:set LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\x64;C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.41.34120\lib\x64\uwp;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\x64 + ```bash + cmake --preset x64-windows-llvm-release + cmake --build build-x64-windows-llvm-release + ``` + ## BLAS Build Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Using BLAS doesn't affect the generation performance. There are currently several different BLAS implementations available for build and use: From 26a8406ba9198eb6fdd8329fa717555b4f77f05f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 9 Dec 2024 20:07:12 +0100 Subject: [PATCH 35/99] CUDA: fix shared memory access condition for mmv (#10740) --- ggml/src/ggml-cuda/mmv.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/mmv.cu b/ggml/src/ggml-cuda/mmv.cu index cfe91f428..a4b4f6bc1 100644 --- a/ggml/src/ggml-cuda/mmv.cu +++ b/ggml/src/ggml-cuda/mmv.cu @@ -57,7 +57,7 @@ static __global__ void mul_mat_vec( if (block_size > WARP_SIZE) { buf_iw[tid/WARP_SIZE] = sumf; __syncthreads(); - if (tid > WARP_SIZE) { + if (tid >= WARP_SIZE) { return; } sumf = buf_iw[tid]; From a05e2afcc241c1ecd38ec5cb4c579d90cdf3f918 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Tue, 10 Dec 2024 11:22:20 -0600 Subject: [PATCH 36/99] vulkan: disable spirv-opt for coopmat shaders (#10763) There are some bugs in the 1.3.296 SDK, so disable this. It isn't strictly necessary anyway. Add missing dependency on vulkan-shaders-gen, so shaders get recompiled when it changes. Fix coopmat support reporting when glslc doesn't support NV_coopmat2. 
--- ggml/src/ggml-vulkan/CMakeLists.txt | 2 +- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 2 ++ ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp | 7 +++++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index 546c853b6..6d46e5f24 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -81,7 +81,7 @@ if (Vulkan_FOUND) --target-cpp ${_ggml_vk_source} --no-clean - DEPENDS ${_ggml_vk_shader_deps} + DEPENDS ${_ggml_vk_shader_deps} ${_ggml_vk_genshaders_cmd} COMMENT "Generate vulkan shaders" ) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 5d9eba983..ad5535c11 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -2425,9 +2425,11 @@ static void ggml_vk_print_gpu_info(size_t idx) { } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 && !getenv("GGML_VK_DISABLE_COOPMAT")) { coopmat_support = true; +#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 && !getenv("GGML_VK_DISABLE_COOPMAT2")) { coopmat2_support = true; +#endif } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index bc6fca506..ee3520833 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -206,10 +206,13 @@ void string_to_spv_func(const std::string& _name, const std::string& in_fname, c std::string target_env = (name.find("_cm2") != std::string::npos) ? "--target-env=vulkan1.3" : "--target-env=vulkan1.2"; + // disable spirv-opt for coopmat shaders for https://github.com/ggerganov/llama.cpp/issues/10734 + std::string opt_level = coopmat ? "" : "-O"; + #ifdef _WIN32 - std::vector cmd = {GLSLC, "-fshader-stage=compute", target_env, "-O", "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""}; + std::vector cmd = {GLSLC, "-fshader-stage=compute", target_env, opt_level, "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""}; #else - std::vector cmd = {GLSLC, "-fshader-stage=compute", target_env, "-O", in_path, "-o", out_fname}; + std::vector cmd = {GLSLC, "-fshader-stage=compute", target_env, opt_level, in_path, "-o", out_fname}; #endif #ifdef GGML_VULKAN_SHADER_DEBUG_INFO From a86ad841f103e471ac0fd7ee8852d1eb5015ce89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Y=C3=BCg?= Date: Tue, 10 Dec 2024 17:22:34 +0000 Subject: [PATCH 37/99] server : add flag to disable the web-ui (#10762) (#10751) Co-authored-by: eugenio.segala --- common/arg.cpp | 7 ++++++ examples/server/README.md | 1 + examples/server/server.cpp | 30 ++++++++++++++---------- examples/server/tests/unit/test_basic.py | 18 ++++++++++++++ examples/server/tests/utils.py | 3 +++ 5 files changed, 46 insertions(+), 13 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 0db59f701..808ec1c3e 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1711,6 +1711,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.public_path = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH")); + add_opt(common_arg( + {"--no-webui"}, + string_format("Disable the Web UI (default: %s)", params.webui ? 
"enabled" : "disabled"), + [](common_params & params) { + params.webui = false; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI")); add_opt(common_arg( {"--embedding", "--embeddings"}, string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), diff --git a/examples/server/README.md b/examples/server/README.md index 117c52d3f..6294f541f 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -146,6 +146,7 @@ The project is under active development, and we are [looking for feedback and co | `--host HOST` | ip address to listen (default: 127.0.0.1)
(env: LLAMA_ARG_HOST) |
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
+| `--no-webui` | disable the Web UI<br/>(env: LLAMA_ARG_NO_WEBUI) |
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
| `--api-key KEY` | API key to use for authentication (default: none)
(env: LLAMA_API_KEY) | diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 47bfd6c4a..8cb992470 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3815,20 +3815,24 @@ int main(int argc, char ** argv) { // Router // - // register static assets routes - if (!params.public_path.empty()) { - // Set the base directory for serving static files - bool is_found = svr->set_mount_point("/", params.public_path); - if (!is_found) { - LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str()); - return 1; - } + if (!params.webui) { + LOG_INF("Web UI is disabled\n"); } else { - // using embedded static index.html - svr->Get("/", [](const httplib::Request &, httplib::Response & res) { - res.set_content(reinterpret_cast(index_html), index_html_len, "text/html; charset=utf-8"); - return false; - }); + // register static assets routes + if (!params.public_path.empty()) { + // Set the base directory for serving static files + bool is_found = svr->set_mount_point("/", params.public_path); + if (!is_found) { + LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str()); + return 1; + } + } else { + // using embedded static index.html + svr->Get("/", [](const httplib::Request &, httplib::Response & res) { + res.set_content(reinterpret_cast(index_html), index_html_len, "text/html; charset=utf-8"); + return false; + }); + } } // register API routes diff --git a/examples/server/tests/unit/test_basic.py b/examples/server/tests/unit/test_basic.py index 1d5124016..1485de8ce 100644 --- a/examples/server/tests/unit/test_basic.py +++ b/examples/server/tests/unit/test_basic.py @@ -1,4 +1,5 @@ import pytest +import requests from utils import * server = ServerPreset.tinyllama2() @@ -76,3 +77,20 @@ def test_load_split_model(): }) assert res.status_code == 200 assert match_regex("(little|girl)+", res.body["content"]) + + +def test_no_webui(): + global server + # default: webui enabled + server.start() + url = f"http://{server.server_host}:{server.server_port}" + res = requests.get(url) + assert res.status_code == 200 + assert "" in res.text + server.stop() + + # with --no-webui + server.no_webui = True + server.start() + res = requests.get(url) + assert res.status_code == 404 diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py index 7c89b9cd3..d988ccf5e 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -72,6 +72,7 @@ class ServerProcess: disable_ctx_shift: int | None = False draft_min: int | None = None draft_max: int | None = None + no_webui: bool | None = None # session variables process: subprocess.Popen | None = None @@ -158,6 +159,8 @@ class ServerProcess: server_args.extend(["--draft-max", self.draft_max]) if self.draft_min: server_args.extend(["--draft-min", self.draft_min]) + if self.no_webui: + server_args.append("--no-webui") args = [str(arg) for arg in [server_path, *server_args]] print(f"bench: starting server with: {' '.join(args)}") From 750cb3e246f7e544124cf18cb0c5e0c8b7e38738 Mon Sep 17 00:00:00 2001 From: Andreas Kieslinger <47689530+aendk@users.noreply.github.com> Date: Tue, 10 Dec 2024 18:23:24 +0100 Subject: [PATCH 38/99] CUDA: rename macros to avoid conflicts with WinAPI (#10736) * Renames NVIDIA GPU-architecture flags to avoid name clashes with WinAPI. (e.g. CC_PASCAL, GPU architecture or WinAPI pascal compiler flag?) * Reverts erroneous rename in SYCL-code. * Renames GGML_CUDA_MIN_CC_DP4A to GGML_CUDA_CC_DP4A. 
* Renames the rest of the compute capability macros for consistency. --- ggml/src/ggml-common.h | 2 +- ggml/src/ggml-cuda/common.cuh | 70 ++++++++++++++++----------------- ggml/src/ggml-cuda/convert.cu | 6 +-- ggml/src/ggml-cuda/fattn.cu | 2 +- ggml/src/ggml-cuda/ggml-cuda.cu | 12 +++--- ggml/src/ggml-cuda/mma.cuh | 8 ++-- ggml/src/ggml-cuda/mmq.cu | 10 ++--- ggml/src/ggml-cuda/mmq.cuh | 26 ++++++------ ggml/src/ggml-cuda/mmvq.cu | 2 +- ggml/src/ggml-cuda/sum.cu | 2 - 10 files changed, 69 insertions(+), 71 deletions(-) diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 7fd2aadec..f13fd4dea 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -473,7 +473,7 @@ GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128) 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255, GGML_TABLE_END() -//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics +//#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A // lowest compute capability for integer intrinsics GGML_TABLE_BEGIN(uint64_t, ksigns64, 128) 0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff, 0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff, diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 535118d87..2c0a56226 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -41,28 +41,28 @@ #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed) #define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons -#define CC_PASCAL 600 -#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products -#define CC_VOLTA 700 -#define CC_TURING 750 -#define CC_AMPERE 800 -#define CC_OFFSET_AMD 1000000 +#define GGML_CUDA_CC_PASCAL 600 +#define GGML_CUDA_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products +#define GGML_CUDA_CC_VOLTA 700 +#define GGML_CUDA_CC_TURING 750 +#define GGML_CUDA_CC_AMPERE 800 +#define GGML_CUDA_CC_OFFSET_AMD 1000000 // GCN/CNDA, wave size is 64 -#define CC_GCN4 (CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16 -#define CC_VEGA (CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue -#define CC_VEGA20 (CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a -#define CC_CDNA (CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers -#define CC_CDNA2 (CC_OFFSET_AMD + 910) // MI210, minimum acc register renameing -#define CC_CDNA3 (CC_OFFSET_AMD + 942) // MI300 +#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16 +#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue +#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a +#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers +#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 910) // MI210, minimum acc register renameing +#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 942) // MI300 // RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32 -#define CC_RDNA1 (CC_OFFSET_AMD + 1010) // RX 5000 -#define CC_RDNA2 (CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a -#define CC_RDNA3 (CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA +#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000 +#define 
GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a +#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA -#define CC_QY1 210 -#define CC_QY2 220 +#define GGML_CUDA_CC_QY1 210 +#define GGML_CUDA_CC_QY2 220 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses @@ -131,36 +131,36 @@ typedef float dfloat; // dequantize float typedef float2 dfloat2; #endif // GGML_CUDA_F16 -#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL +#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL #define FP16_AVAILABLE -#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL +#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL #if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610 #define FAST_FP16_AVAILABLE #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610 -#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA #define FP16_MMA_AVAILABLE -#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA -#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING #define INT8_MMA_AVAILABLE -#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING -#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1) +#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1) #define FLASH_ATTN_AVAILABLE -#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1) +#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1) static constexpr bool fast_fp16_available(const int cc) { - return cc >= CC_PASCAL && cc != 610; + return cc >= GGML_CUDA_CC_PASCAL && cc != 610; } static constexpr bool fp16_mma_available(const int cc) { - return cc < CC_OFFSET_AMD && cc >= CC_VOLTA; + return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA; } static constexpr bool int8_mma_available(const int cc) { - return cc < CC_OFFSET_AMD && cc >= CC_TURING; + return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING; } [[noreturn]] @@ -187,7 +187,7 @@ static __device__ void no_device_code( #endif // __CUDA_ARCH__ static __device__ __forceinline__ int warp_reduce_sum(int x) { -#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE return __reduce_add_sync(0xffffffff, x); #else #pragma unroll @@ -195,7 +195,7 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) { x += __shfl_xor_sync(0xffffffff, x, offset, 32); } return x; -#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE } static __device__ __forceinline__ float warp_reduce_sum(float x) { @@ 
-284,7 +284,7 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal } static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { -#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL #pragma unroll for (int offset = 16; offset > 0; offset >>= 1) { x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32)); @@ -293,7 +293,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { #else GGML_UNUSED(x); NO_DEVICE_CODE; -#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL } #if CUDART_VERSION < CUDART_HMASK @@ -333,13 +333,13 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) -#if __CUDA_ARCH__ >= MIN_CC_DP4A +#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A return __dp4a(a, b, c); -#else // __CUDA_ARCH__ >= MIN_CC_DP4A +#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A const int8_t * a8 = (const int8_t *) &a; const int8_t * b8 = (const int8_t *) &b; return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3]; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) } diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index c0a444707..3896f956d 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -26,7 +26,7 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __ template static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) { -#if __CUDA_ARCH__ >= CC_PASCAL +#if __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE; const int64_t i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x; @@ -64,7 +64,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h GGML_UNUSED(y); GGML_UNUSED(k); NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= CC_PASCAL +#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL } template @@ -599,7 +599,7 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { case GGML_TYPE_Q5_1: return dequantize_block_cuda; case GGML_TYPE_Q8_0: - if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= CC_PASCAL) { + if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= GGML_CUDA_CC_PASCAL) { return dequantize_block_q8_0_f16_cuda; } return dequantize_block_cuda; diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index 0e7ebbc53..0b26b0f8e 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -304,7 +304,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV); // On AMD the tile kernels perform poorly, use the vec kernel instead: - if (cc >= CC_OFFSET_AMD) { + if (cc >= GGML_CUDA_CC_OFFSET_AMD) { if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) { ggml_cuda_flash_attn_ext_vec_f16(ctx, dst); } else { diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 15fcb2a65..c180adc84 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -177,7 +177,7 @@ static 
ggml_cuda_device_info ggml_cuda_init() { info.devices[id].smpb = prop.sharedMemPerBlock; #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) info.devices[id].smpbo = prop.sharedMemPerBlock; - info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD; + info.devices[id].cc = 100*prop.major + 10*prop.minor + GGML_CUDA_CC_OFFSET_AMD; #else info.devices[id].smpbo = prop.sharedMemPerBlockOptin; info.devices[id].cc = 100*prop.major + 10*prop.minor; @@ -1081,7 +1081,7 @@ static void ggml_cuda_op_mul_mat_cublas( const int compute_capability = ggml_cuda_info().devices[id].cc; - if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { + if (compute_capability >= GGML_CUDA_CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 ggml_cuda_pool_alloc src0_as_f16(ctx.pool(id)); if (src0->type != GGML_TYPE_F16) { @@ -1108,7 +1108,7 @@ static void ggml_cuda_op_mul_mat_cublas( const half beta_f16 = 0.0f; cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F; - if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) { + if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) { cu_compute_type = CUBLAS_COMPUTE_32F; } @@ -1612,7 +1612,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F; cudaDataType_t cu_data_type = CUDA_R_16F; - if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) { + if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) { cu_compute_type = CUBLAS_COMPUTE_32F; } @@ -2357,7 +2357,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, std::vector ggml_cuda_cpy_fn_ptrs; if (cuda_ctx->cuda_graph->graph == nullptr) { - if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) { + if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) { cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true; #ifndef NDEBUG GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__); @@ -3028,7 +3028,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return true; } const int cc = ggml_cuda_info().devices[dev_ctx->device].cc; - return cc >= CC_VOLTA && cc < CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16; + return cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16; } case GGML_OP_CROSS_ENTROPY_LOSS: case GGML_OP_CROSS_ENTROPY_LOSS_BACK: diff --git a/ggml/src/ggml-cuda/mma.cuh b/ggml/src/ggml-cuda/mma.cuh index a452a3cc3..7d11540af 100644 --- a/ggml/src/ggml-cuda/mma.cuh +++ b/ggml/src/ggml-cuda/mma.cuh @@ -171,7 +171,7 @@ struct mma_int_C_I16J8 { __device__ __forceinline__ void mma_K4(const mma_int_A_I16K4 & mma_A, const mma_int_B_J8K4 & mma_B) { #ifdef INT8_MMA_AVAILABLE -#if __CUDA_ARCH__ >= CC_AMPERE +#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE asm("mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5}, {%6}, {%0, %1, %2, %3};" : "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3]) : "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_B.x[0])); @@ -183,7 +183,7 @@ struct mma_int_C_I16J8 { 
asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};" : "+r"(x[2]), "+r"(x[3]) : "r"(mma_A.x[1]), "r"(mma_B.x[0])); -#endif // __CUDA_ARCH__ >= CC_AMPERE +#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE #else GGML_UNUSED(mma_A); GGML_UNUSED(mma_B); @@ -193,7 +193,7 @@ struct mma_int_C_I16J8 { __device__ __forceinline__ void mma_K8(const mma_int_A_I16K8 & mma_A, const mma_int_B_J8K8 & mma_B) { #ifdef INT8_MMA_AVAILABLE -#if __CUDA_ARCH__ >= CC_AMPERE +#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE asm("mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3};" : "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3]) : "r"(mma_A.x[0]), "r"(mma_A.x[1]), "r"(mma_A.x[2]), "r"(mma_A.x[3]), "r"(mma_B.x[0]), "r"(mma_B.x[1])); @@ -211,7 +211,7 @@ struct mma_int_C_I16J8 { asm("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 {%0, %1}, {%2}, {%3}, {%0, %1};" : "+r"(x[2]), "+r"(x[3]) : "r"(mma_A.x[3]), "r"(mma_B.x[1])); -#endif // __CUDA_ARCH__ >= CC_AMPERE +#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE #else GGML_UNUSED(mma_A); GGML_UNUSED(mma_B); diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index 7f7c8c90b..270251df4 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -27,7 +27,7 @@ void ggml_cuda_op_mul_mat_q( // The stream-k decomposition is only faster for recent NVIDIA GPUs. // Also its fixup needs to allocate a temporary buffer in the memory pool. // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer. - const bool use_stream_k = compute_capability >= CC_VOLTA && compute_capability < CC_OFFSET_AMD && src1_ncols == ne11; + const bool use_stream_k = compute_capability >= GGML_CUDA_CC_VOLTA && compute_capability < GGML_CUDA_CC_OFFSET_AMD && src1_ncols == ne11; const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k}; switch (src0->type) { @@ -136,7 +136,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) { return true; } - if (cc < MIN_CC_DP4A) { + if (cc < GGML_CUDA_CC_DP4A) { return false; } @@ -144,9 +144,9 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) { return true; #endif //GGML_CUDA_FORCE_MMQ - if (cc < CC_OFFSET_AMD) { - return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; + if (cc < GGML_CUDA_CC_OFFSET_AMD) { + return cc < GGML_CUDA_CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; } - return (cc < CC_RDNA3 && cc != CC_CDNA && cc != CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; + return (cc < GGML_CUDA_CC_RDNA3 && cc != GGML_CUDA_CC_CDNA && cc != GGML_CUDA_CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; } diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 8d8867121..3cd508a1d 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -89,9 +89,9 @@ struct tile_x_sizes { static constexpr int get_mmq_x_max_host(const int cc) { return int8_mma_available(cc) ? 128 : #ifdef GGML_CUDA_FORCE_MMQ - cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? 128 : 64; + cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? 128 : 64; #else - cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_DP4A_MAX_BATCH_SIZE : 64; + cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? 
MMQ_DP4A_MAX_BATCH_SIZE : 64; #endif // GGML_CUDA_FORCE_MMQ } @@ -104,23 +104,23 @@ static constexpr __device__ int get_mmq_x_max_device() { return 128; #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) -#if __CUDA_ARCH__ >= CC_VOLTA +#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA #ifdef GGML_CUDA_FORCE_MMQ return MMQ_DP4A_MAX_BATCH_SIZE; #else // GGML_CUDA_FORCE_MMQ return 128; #endif // GGML_CUDA_FORCE_MMQ -#else // __CUDA_ARCH__ >= CC_VOLTA +#else // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA return 64; -#endif // __CUDA_ARCH__ >= CC_VOLTA +#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #endif // INT8_MMA_AVAILABLE } static constexpr int get_mmq_y_host(const int cc) { - return cc >= CC_OFFSET_AMD ? (cc == CC_RDNA1 ? 64 : 128) : (cc >= CC_VOLTA ? 128 : 64); + return cc >= GGML_CUDA_CC_OFFSET_AMD ? (cc == GGML_CUDA_CC_RDNA1 ? 64 : 128) : (cc >= GGML_CUDA_CC_VOLTA ? 128 : 64); } static constexpr __device__ int get_mmq_y_device() { @@ -131,11 +131,11 @@ static constexpr __device__ int get_mmq_y_device() { return 128; #endif // defined RDNA1 #else -#if __CUDA_ARCH__ >= CC_VOLTA +#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA return 128; #else return 64; -#endif // __CUDA_ARCH__ >= CC_VOLTA +#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) } @@ -2574,11 +2574,11 @@ template __launch_bounds__(WARP_SIZE*nwarps, 2) #endif // defined(RDNA3) || defined(RDNA2) || defined(CDNA) || defined(GCN) #else -#if __CUDA_ARCH__ >= CC_VOLTA +#if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA __launch_bounds__(WARP_SIZE*nwarps, 1) #else __launch_bounds__(WARP_SIZE*nwarps, 2) -#endif // __CUDA_ARCH__ >= CC_VOLTA +#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) static __global__ void mul_mat_q( const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup, @@ -2594,7 +2594,7 @@ static __global__ void mul_mat_q( constexpr int mmq_y = get_mmq_y_device(); // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead: -#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA +#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA { constexpr bool fixup = false; mul_mat_q_process_tile @@ -2602,7 +2602,7 @@ static __global__ void mul_mat_q( blockIdx.x, blockIdx.y, 0, ne00/qk); return; } -#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA +#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA const int64_t blocks_per_ne00 = ne00 / qk; constexpr int blocks_per_iter = MMQ_ITER_K / qk; @@ -2825,7 +2825,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda const int mmq_x_max = get_mmq_x_max_host(cc); const int mmq_y = get_mmq_y_host(cc); const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y; - const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD; + const bool use_stream_k = cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD; int mmq_x_best = 0; int nparts_best = INT_MAX; diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 02d150983..e3b912d87 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -142,7 +142,7 @@ static void mul_mat_vec_q_cuda( int64_t nwarps = 1; int64_t rows_per_cuda_block = 1; - if 
(ggml_cuda_info().devices[id].cc < CC_CDNA || ggml_cuda_info().devices[id].cc == CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA + if (ggml_cuda_info().devices[id].cc < GGML_CUDA_CC_CDNA || ggml_cuda_info().devices[id].cc == GGML_CUDA_CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA switch(ncols_y) { case 1: nwarps = 4; diff --git a/ggml/src/ggml-cuda/sum.cu b/ggml/src/ggml-cuda/sum.cu index 31cfe5394..e0dafc1d2 100644 --- a/ggml/src/ggml-cuda/sum.cu +++ b/ggml/src/ggml-cuda/sum.cu @@ -3,8 +3,6 @@ #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700 #ifdef USE_CUB -// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh. -// For this reason CUB must be included BEFORE anything else. #include using namespace cub; #endif // USE_CUB From ae4b922614d452477cf5d2fb8cad247c9c12596c Mon Sep 17 00:00:00 2001 From: Bartowski Date: Tue, 10 Dec 2024 12:23:50 -0500 Subject: [PATCH 39/99] imatrix : Add imatrix to --no-context-shift (#10766) This allows for setting the --no-context-shift value in llama-imatrix which is required for models like DeepSeek --- common/arg.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/arg.cpp b/common/arg.cpp index 808ec1c3e..49af31682 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -591,7 +591,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.ctx_shift = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); add_opt(common_arg( {"--chunks"}, "N", string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), From dafae66cc242eb766797194d3c85c5e502625623 Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Tue, 10 Dec 2024 19:33:23 +0000 Subject: [PATCH 40/99] vulkan: dynamic subgroup size for the remaining k quants (#10745) * q5_k q4_k q3_k q2_k q6_k multi row example * revert as multi row isnt faster for k quants --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 30 ++++++++----------- .../vulkan-shaders/mul_mat_vec_base.comp | 2 -- .../vulkan-shaders/mul_mat_vec_q2_k.comp | 27 ++++++++++------- .../vulkan-shaders/mul_mat_vec_q3_k.comp | 27 ++++++++++------- .../vulkan-shaders/mul_mat_vec_q4_k.comp | 26 +++++++++------- .../vulkan-shaders/mul_mat_vec_q5_k.comp | 21 ++++++++----- 6 files changed, 72 insertions(+), 61 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index ad5535c11..bb9572d76 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -44,12 +44,6 @@ #define MAX_VK_BUFFERS 256 -#ifndef K_QUANTS_PER_ITERATION -#define K_QUANTS_PER_ITERATION 1 -#else -static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); -#endif - #define VK_CHECK(err, msg) \ do { \ vk::Result err_ = (err); \ @@ -1792,10 +1786,10 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, 
device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); @@ -1806,10 +1800,10 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, 
sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size}, 1, true); @@ -1820,10 +1814,10 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, 
device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size, 1}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {device->subgroup_size}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1, true); diff --git 
a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp index 2ec1af5c7..3894fca82 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp @@ -2,8 +2,6 @@ #extension GL_EXT_shader_16bit_storage : require #extension GL_EXT_shader_8bit_storage : require -#define K_QUANTS_PER_ITERATION 2 - #ifdef MUL_MAT_ID #define EXPERT_COUNT 8 #endif diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp index fcf02210e..1a5350d99 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp @@ -3,9 +3,11 @@ #include "mul_mat_vec_base.comp" -layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; -shared FLOAT_TYPE tmp[32]; +layout (constant_id = 0) const uint BLOCK_SIZE = 32; + +shared FLOAT_TYPE tmp[BLOCK_SIZE]; void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; @@ -20,22 +22,25 @@ void main() { const uint num_blocks_per_row = p.ncols / QUANT_K; const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row; - const uint tid = gl_LocalInvocationID.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const uint ix = gl_LocalInvocationID.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 + // 16 threads are used to process each block + const uint it_size = gl_WorkGroupSize.x/16; + const uint tid = gl_LocalInvocationID.x; + const uint itid = tid%16; // 0...16 + const uint ix = tid/16; - const uint step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + const uint step = 8; - const uint v_im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const uint v_in = tid - step*v_im; // 0...15 or 0...7 + const uint v_im = itid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
+ const uint v_in = itid - step*v_im; // 0...15 or 0...7 - const uint l0 = K_QUANTS_PER_ITERATION*v_in; // 0...15 + const uint l0 = 2*v_in; // 0...15 const uint q_offset = 32*v_im + l0; const uint s_offset = 8*v_im; const uint y_offset = 128*v_im + l0; FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp - [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { const uint y_idx = i * QUANT_K + y_offset; f16vec2 d = data_a[ib0 + i].d; @@ -71,7 +76,7 @@ void main() { FLOAT_TYPE sum1 = FLOAT_TYPE(0.0); FLOAT_TYPE sum2 = FLOAT_TYPE(0.0); - [[unroll]] for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + [[unroll]] for (int l = 0; l < 2; ++l) { sum1 = fma(FLOAT_TYPE(b0[l]), FLOAT_TYPE(s0_lo4[0]) * FLOAT_TYPE((qs0[l] >> 0) & 3), fma(FLOAT_TYPE(b16[l]), FLOAT_TYPE(s0_lo4[1]) * FLOAT_TYPE((qs16[l] >> 0) & 3), fma(FLOAT_TYPE(b32[l]), FLOAT_TYPE(s0_lo4[2]) * FLOAT_TYPE((qs0[l] >> 2) & 3), @@ -96,7 +101,7 @@ void main() { // sum up partial sums and write back result barrier(); - [[unroll]] for (uint s = 16; s > 0; s >>= 1) { + [[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) { if (tid < s) { tmp[tid] += tmp[tid + s]; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp index 723fadde0..b19c38111 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp @@ -3,9 +3,11 @@ #include "mul_mat_vec_base.comp" -layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; -shared FLOAT_TYPE tmp[32]; +layout (constant_id = 0) const uint BLOCK_SIZE = 32; + +shared FLOAT_TYPE tmp[BLOCK_SIZE]; void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; @@ -20,17 +22,20 @@ void main() { const uint num_blocks_per_row = p.ncols / QUANT_K; const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row; - const uint tid = gl_LocalInvocationID.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const uint ix = gl_LocalInvocationID.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 + // 16 threads are used to process each block + const uint it_size = gl_WorkGroupSize.x/16; + const uint tid = gl_LocalInvocationID.x; + const uint itid = tid%16; // 0...16 + const uint ix = tid/16; - const uint step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + const uint step = 8; - const uint v_im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const uint v_in = tid - step*v_im; // 0...15 or 0...7 + const uint v_im = itid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
+ const uint v_in = itid - step*v_im; // 0...15 or 0...7 const uint8_t m = uint8_t(1 << (4 * v_im)); - const uint l0 = K_QUANTS_PER_ITERATION*v_in; // 0...15 + const uint l0 = 2*v_in; // 0...15 const uint q_offset = 32*v_im + l0; const uint y_offset = 128*v_im + l0; @@ -38,7 +43,7 @@ void main() { const uint s_shift = 4 * v_im; - [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { const uint y_idx = i * QUANT_K + y_offset; const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); @@ -66,7 +71,7 @@ void main() { u8vec2 s10 = unpack8(s10_16); FLOAT_TYPE sum = FLOAT_TYPE(0.0); - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + [[unroll]] for (int l = 0; l < 2; ++l) { sum = fma(FLOAT_TYPE(b0[l]) * FLOAT_TYPE(int8_t(((s0[0] >> s_shift) & 0xF) | ((s8[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4)), fma(FLOAT_TYPE(b32[l]) * FLOAT_TYPE(int8_t(((s2[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4)), fma(FLOAT_TYPE(b64[l]) * FLOAT_TYPE(int8_t(((s4[0] >> s_shift) & 0xF) | ((s8[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4)), @@ -83,7 +88,7 @@ void main() { // sum up partial sums and write back result barrier(); - [[unroll]] for (uint s = 16; s > 0; s >>= 1) { + [[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) { if (tid < s) { tmp[tid] += tmp[tid + s]; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp index 5846f2e86..b86d28589 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp @@ -4,11 +4,12 @@ #include "mul_mat_vec_base.comp" -layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; -shared FLOAT_TYPE tmp[32]; +layout (constant_id = 0) const uint BLOCK_SIZE = 32; + +shared FLOAT_TYPE tmp[BLOCK_SIZE]; -// This shader assumes K_QUANTS_PER_ITERATION == 2 for alignment of loads void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; @@ -22,14 +23,17 @@ void main() { const uint num_blocks_per_row = p.ncols / QUANT_K; const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row; - const uint tid = gl_LocalInvocationID.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const uint ix = gl_LocalInvocationID.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 + // 16 threads are used to process each block + const uint it_size = gl_WorkGroupSize.x/16; + const uint tid = gl_LocalInvocationID.x; + const uint itid = tid%16; // 0...16 + const uint ix = tid/16; - const uint step = 8/K_QUANTS_PER_ITERATION; // 8 or 4 + const uint step = 4; - const uint il = tid/step; // 0...3 - const uint ir = tid - step*il; // 0...7 or 0...3 - const uint n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4 + const uint il = itid/step; // 0...3 + const uint ir = itid - step*il; // 0...7 or 0...3 + const uint n = 4; const uint v_im = il / 2; // 0 or 1. 
0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 const uint v_in = il % 2; @@ -40,7 +44,7 @@ void main() { FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp - [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { const uint y1_idx = i * QUANT_K + y_offset; const uint y2_idx = y1_idx + 128; @@ -115,7 +119,7 @@ void main() { // sum up partial sums and write back result barrier(); - [[unroll]] for (uint s = 16; s > 0; s >>= 1) { + [[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) { if (tid < s) { tmp[tid] += tmp[tid + s]; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp index b455cbd31..fd243cf91 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp @@ -4,9 +4,11 @@ #include "mul_mat_vec_base.comp" -layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; -shared FLOAT_TYPE tmp[32]; +layout (constant_id = 0) const uint BLOCK_SIZE = 32; + +shared FLOAT_TYPE tmp[BLOCK_SIZE]; void main() { const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; @@ -21,11 +23,14 @@ void main() { const uint num_blocks_per_row = p.ncols / QUANT_K; const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row; - const uint tid = gl_LocalInvocationID.x/2; // 0...31 or 0...16 - const uint ix = gl_LocalInvocationID.x%2; // 0 or 0, 1 + // 16 threads are used to process each block + const uint it_size = gl_WorkGroupSize.x/16; + const uint tid = gl_LocalInvocationID.x; + const uint itid = tid%16; // 0...16 + const uint ix = tid/16; - const uint il = tid/4; // 0...3 - const uint ir = tid - 4*il; // 0...7 or 0...3 + const uint il = itid/4; // 0...3 + const uint ir = itid - 4*il; // 0...7 or 0...3 const uint v_im = il / 2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 const uint v_in = il % 2; @@ -36,7 +41,7 @@ void main() { FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp - [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += 2) { + [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { const uint y1_idx = i * QUANT_K + y_offset; const uint y2_idx = y1_idx + 128; @@ -143,7 +148,7 @@ void main() { // sum up partial sums and write back result barrier(); - [[unroll]] for (uint s = 16; s > 0; s >>= 1) { + [[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) { if (tid < s) { tmp[tid] += tmp[tid + s]; } From b685daf3867c54e42a9db484d7b92619021d4510 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Tue, 10 Dec 2024 14:23:17 -0600 Subject: [PATCH 41/99] vulkan: request round-to-even for fp16 in im2col/rope_head (#10767) Vulkan doesn't mandate a specific rounding mode, but the shader_float_controls feature allows rounding mode to be requested if the implementation supports it. 
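For readers unfamiliar with the terminology, "round-to-even" (RTE) means ties are resolved toward the even result, which keeps repeated fp32->fp16 conversions from drifting in one direction. A small host-side C++ illustration of the rounding rule itself (not part of the patch, and independent of any Vulkan API usage):

```cpp
// Demonstrates IEEE round-to-nearest-even (ties to even) versus truncation.
#include <cfenv>
#include <cmath>
#include <cstdio>

int main() {
    std::fesetround(FE_TONEAREST); // round-to-nearest-even; this is also the default mode
    const double ties[] = {0.5, 1.5, 2.5, 3.5};
    for (double v : ties) {
        // nearbyint() honors the current rounding mode: 0.5 -> 0, 1.5 -> 2, 2.5 -> 2, 3.5 -> 4
        std::printf("%.1f -> %.0f (RTE)   %.0f (truncate)\n", v, std::nearbyint(v), std::trunc(v));
    }
    return 0;
}
```

The change below checks VkPhysicalDeviceVulkan12Properties::shaderRoundingModeRTEFloat16 and, when the device reports support, uses *_rte shader variants that request this mode for 16-bit conversions via a SPIR-V execution mode.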
--- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 23 +++++++++++++++---- .../ggml-vulkan/vulkan-shaders/im2col.comp | 5 ++++ .../ggml-vulkan/vulkan-shaders/rope_head.comp | 5 ++++ .../vulkan-shaders/vulkan-shaders-gen.cpp | 3 +++ 4 files changed, 31 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index bb9572d76..a8ae58ee2 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -162,6 +162,7 @@ struct vk_device_struct { uint32_t subgroup_size; uint32_t shader_core_count; bool uma; + bool float_controls_rte_fp16; bool coopmat2; bool coopmat_support; @@ -1916,17 +1917,26 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + + if (device->float_controls_rte_fp16) { + ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_rte_len, rope_norm_f16_rte_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_rte_len, rope_neox_f16_rte_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + } else { + ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + } ggml_vk_create_pipeline(device, device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_im2col_f32, "im2col_f32", im2col_f32_len, im2col_f32_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1); + if (device->float_controls_rte_fp16) { + ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_rte_len, im2col_f32_f16_rte_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1); + } else { + 
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1); + } ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1); @@ -2007,11 +2017,13 @@ static vk_device ggml_vk_get_device(size_t idx) { vk::PhysicalDeviceDriverProperties driver_props; vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props; vk::PhysicalDeviceShaderCoreProperties2AMD amd_shader_core_properties2_props; + vk::PhysicalDeviceVulkan12Properties vk12_props; props2.pNext = &props3; props3.pNext = &subgroup_props; subgroup_props.pNext = &driver_props; + driver_props.pNext = &vk12_props; - VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&driver_props; + VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&vk12_props; if (maintenance4_support) { last_struct->pNext = (VkBaseOutStructure *)&props4; @@ -2057,6 +2069,7 @@ static vk_device ggml_vk_get_device(size_t idx) { } else { device->shader_core_count = 0; } + device->float_controls_rte_fp16 = vk12_props.shaderRoundingModeRTEFloat16; const bool force_disable_f16 = getenv("GGML_VK_DISABLE_F16") != nullptr; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp b/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp index 4d48610a3..966fedf8f 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp @@ -1,6 +1,11 @@ #version 450 #extension GL_EXT_shader_16bit_storage : require +#extension GL_EXT_spirv_intrinsics: enable + +#if RTE16 +spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits +#endif layout (push_constant) uniform parameter { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp index ea8954226..574b51ca5 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp @@ -1,6 +1,11 @@ #include "types.comp" #extension GL_EXT_shader_16bit_storage : require +#extension GL_EXT_spirv_intrinsics: enable + +#if RTE16 +spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bits +#endif layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index ee3520833..c48a228ae 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -461,9 +461,11 @@ void process_shaders() { string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); + string_to_spv("rope_norm_f16_rte", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}}); string_to_spv("rope_neox_f32", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("rope_neox_f16", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); + string_to_spv("rope_neox_f16_rte", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}}); string_to_spv("argsort_f32", "argsort.comp", {{"A_TYPE", "float"}}); @@ -471,6 +473,7 @@ void process_shaders() { 
string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}})); + string_to_spv("im2col_f32_f16_rte", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}})); string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); From 43041d2eb32623d5b6ca734313327abe96f73146 Mon Sep 17 00:00:00 2001 From: "Gilad S." <7817232+giladgd@users.noreply.github.com> Date: Wed, 11 Dec 2024 02:47:21 +0200 Subject: [PATCH 42/99] ggml: load all backends from a user-provided search path (#10699) * feat: load all backends from a user-provided search path * fix: Windows search path * refactor: rename `ggml_backend_load_all_in_search_path` to `ggml_backend_load_all_from_path` * refactor: rename `search_path` to `dir_path` * fix: change `NULL` to `nullptr` Co-authored-by: Diego Devesa * fix: change `NULL` to `nullptr` --------- Co-authored-by: Diego Devesa --- ggml/include/ggml-backend.h | 1 + ggml/src/ggml-backend-reg.cpp | 40 +++++++++++++++++++++++------------ 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 19881a505..7221a0830 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -228,6 +228,7 @@ extern "C" { GGML_API void ggml_backend_unload(ggml_backend_reg_t reg); // Load all known backends from dynamic libraries GGML_API void ggml_backend_load_all(void); + GGML_API void ggml_backend_load_all_from_path(const char * dir_path); // // Backend scheduler diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 5cb0fb9d1..2e7340145 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -449,11 +449,21 @@ static std::string backend_filename_suffix() { #endif } -static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) { +static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) { // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths // TODO: search system paths - std::vector search_paths = { "./", get_executable_path() }; std::string file_prefix = backend_filename_prefix() + name + "-"; + std::vector search_paths; + if (user_search_path == nullptr) { + search_paths.push_back("./"); + search_paths.push_back(get_executable_path()); + } else { +#if defined(_WIN32) + search_paths.push_back(std::string(user_search_path) + "\\"); +#else + search_paths.push_back(std::string(user_search_path) + "/"); +#endif + } int best_score = 0; std::string best_path; @@ -509,21 +519,25 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) } void ggml_backend_load_all() { + ggml_backend_load_all_from_path(nullptr); +} + +void ggml_backend_load_all_from_path(const char * dir_path) { #ifdef NDEBUG bool silent = true; #else bool silent = false; #endif - ggml_backend_load_best("blas", silent); - ggml_backend_load_best("cann", silent); - ggml_backend_load_best("cuda", silent); - ggml_backend_load_best("hip", silent); - ggml_backend_load_best("kompute", silent); - ggml_backend_load_best("metal", silent); - ggml_backend_load_best("rpc", silent); - ggml_backend_load_best("sycl", silent); - ggml_backend_load_best("vulkan", silent); - ggml_backend_load_best("musa", silent); - 
ggml_backend_load_best("cpu", silent); + ggml_backend_load_best("blas", silent, dir_path); + ggml_backend_load_best("cann", silent, dir_path); + ggml_backend_load_best("cuda", silent, dir_path); + ggml_backend_load_best("hip", silent, dir_path); + ggml_backend_load_best("kompute", silent, dir_path); + ggml_backend_load_best("metal", silent, dir_path); + ggml_backend_load_best("rpc", silent, dir_path); + ggml_backend_load_best("sycl", silent, dir_path); + ggml_backend_load_best("vulkan", silent, dir_path); + ggml_backend_load_best("musa", silent, dir_path); + ggml_backend_load_best("cpu", silent, dir_path); } From 4b4d92b0986e3b627b9a9ef4782973108bf47691 Mon Sep 17 00:00:00 2001 From: CentricStorm Date: Wed, 11 Dec 2024 10:47:43 +0000 Subject: [PATCH 43/99] docs: fix server documentation formatting (#10776) --- examples/server/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/README.md b/examples/server/README.md index 6294f541f..2bd23d1a6 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -442,7 +442,7 @@ These words will not be included in the completion, so make sure to add them to `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values. - `timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false` +`timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false` **Response format** From 484d2f31aed34ff9f096e3961125762e81d9b7d6 Mon Sep 17 00:00:00 2001 From: kallewoof Date: Wed, 11 Dec 2024 22:48:04 +0900 Subject: [PATCH 44/99] bug-fix: snprintf prints NULL in place of the last character (#10419) * bug-fix: snprintf prints NULL in place of the last character We need to give snprintf enough space to print the last character and the null character, thus we allocate one extra byte and then ignore it when converting to std::string. 
* add comment about extra null-term byte requirement --- examples/server/utils.hpp | 2 +- include/llama.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 8f545aea5..2fcb895ab 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -333,7 +333,7 @@ static std::string llama_get_chat_template(const struct llama_model * model) { if (res < 2) { return ""; } else { - std::vector model_template(res, 0); + std::vector model_template(res + 1, 0); llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size()); return std::string(model_template.data(), model_template.size() - 1); } diff --git a/include/llama.h b/include/llama.h index 36945cde3..eebbacb80 100644 --- a/include/llama.h +++ b/include/llama.h @@ -456,6 +456,7 @@ extern "C" { // Functions to access the model's GGUF metadata scalar values // - The functions return the length of the string on success, or -1 on failure // - The output string is always null-terminated and cleared on failure + // - When retrieving a string, an extra byte must be allocated to account for the null terminator // - GGUF array values are not supported by these functions // Get metadata value as a string by key name From 92f77a640f763c0af73554fb810a85a7d4c85e5e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 11 Dec 2024 14:59:41 +0100 Subject: [PATCH 45/99] ci : pin nodejs to 22.11.0 (#10779) --- .github/workflows/server.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 9e66fb68c..671fe595c 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -79,7 +79,7 @@ jobs: # Setup nodejs (to be used for verifying bundled index.html) - uses: actions/setup-node@v4 with: - node-version: 22 + node-version: '22.11.0' - name: Verify bundled index.html id: verify_server_index_html From 1a31d0dc00ba946d448e16ecc915ce5e8355994e Mon Sep 17 00:00:00 2001 From: qingy1337 Date: Wed, 11 Dec 2024 07:16:32 -0800 Subject: [PATCH 46/99] Update README.md (#10772) --- examples/quantize/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/quantize/README.md b/examples/quantize/README.md index 5d1e11c67..f9cce7b21 100644 --- a/examples/quantize/README.md +++ b/examples/quantize/README.md @@ -81,7 +81,7 @@ Several quantization methods are supported. 
They differ in the resulting model d - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930) - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957) - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969) - - [#4996 - k-qunats tuning](https://github.com/ggerganov/llama.cpp/pull/4996) + - [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996) - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060) - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196) - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361) From 235f6e14bf0ed0211c51aeff14139038ae1000aa Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 11 Dec 2024 20:52:14 +0100 Subject: [PATCH 47/99] server : (UI) add tok/s, get rid of completion.js (#10786) * get rid of completion.js * extract chat bubble to a component * add tok/s info * sync * fix BASE_URL * only extract timings when it's enabled * fix auto scroll --- examples/server/public/index.html | 198 ++++++++++++--------- examples/server/webui/index.html | 124 ++++++++----- examples/server/webui/package-lock.json | 7 + examples/server/webui/package.json | 1 + examples/server/webui/src/completion.js | 225 ------------------------ examples/server/webui/src/main.js | 128 +++++++++++--- 6 files changed, 307 insertions(+), 376 deletions(-) delete mode 100644 examples/server/webui/src/completion.js diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 250729a44..9a19c5e83 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -11,84 +11,84 @@ 🦙 llama.cpp - chat - - + @@ -99,7 +99,7 @@ Server rendered element contains fewer child nodes than client vdom.`),T=!0),au(
-
+

Conversations

@@ -204,51 +204,25 @@ Server rendered element contains fewer child nodes than client vdom.`),T=!0),au( {{ messages.length === 0 ? 'Send a message to start' : '' }}
-
-
- - - - -
-
- - -
- - - - - -
+
-
-
- - -
+
+
@@ -311,6 +285,10 @@ Server rendered element contains fewer child nodes than client vdom.`),T=!0),au(
Advanced config
+
+ + Show tokens per second +
+ + + + +