mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-25 10:54:36 +00:00
Compare commits
25 Commits
e76326e3e0
...
fa2f378326
Author | SHA1 | Date | |
---|---|---|---|
|
fa2f378326 | ||
|
9ba399dfa7 | ||
|
2cd43f4900 | ||
|
09fe2e7613 | ||
|
30caac3a68 | ||
|
60cfa728e2 | ||
|
3327bb0f8d | ||
|
24bad77ebf | ||
|
f91cf62b89 | ||
|
bc93d2a44e | ||
|
6e9fdb0b52 | ||
|
3b409c1e92 | ||
|
82cbfda7b9 | ||
|
5ff563257c | ||
|
692880535a | ||
|
816d93db75 | ||
|
6c50e9caca | ||
|
7d80a4aa97 | ||
|
55a6f951ca | ||
|
3b27041727 | ||
|
ae41d3efed | ||
|
6fc90cb727 | ||
|
4f696624a4 | ||
|
a249dc0fbb | ||
|
677058f470 |
@ -91,7 +91,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
- [x] [Bitnet b1.58 models](https://huggingface.co/1bitLLM)
|
||||
- [x] [Flan T5](https://huggingface.co/models?search=flan-t5)
|
||||
- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
|
||||
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
|
||||
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b) + [GLMEdge-1.5b](https://huggingface.co/THUDM/glm-edge-1.5b-chat) + [GLMEdge-4b](https://huggingface.co/THUDM/glm-edge-4b-chat)
|
||||
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)
|
||||
- [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
|
||||
- [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
|
||||
@ -111,6 +111,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
||||
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
|
||||
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
|
||||
- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
|
||||
- [x] [GLM-EDGE](https://huggingface.co/models?search=glm-edge)
|
||||
- [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)
|
||||
|
||||
</details>
|
||||
|
@ -642,7 +642,7 @@ class Model:
|
||||
if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
|
||||
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
|
||||
res = "jina-v2-code"
|
||||
if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
|
||||
if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
|
||||
# ref: https://huggingface.co/THUDM/glm-4-9b-chat
|
||||
res = "chatglm-bpe"
|
||||
if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
|
||||
@ -4280,7 +4280,7 @@ class JaisModel(Model):
|
||||
self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
|
||||
|
||||
|
||||
@Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
|
||||
@Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
|
||||
class ChatGLMModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.CHATGLM
|
||||
|
||||
@ -4386,47 +4386,56 @@ class ChatGLMModel(Model):
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
|
||||
vocab_size = hparams["padded_vocab_size"]
|
||||
vocab_size = hparams.get("padded_vocab_size",hparams["vocab_size"])
|
||||
assert max(tokenizer.get_vocab().values()) < vocab_size
|
||||
|
||||
tokpre = self.get_vocab_base_pre(tokenizer)
|
||||
if(hparams["partial_rotary_factor"] == 1.0):
|
||||
# only for glm-edge series
|
||||
tokens, toktypes, tokpre = self.get_vocab_base()
|
||||
self.gguf_writer.add_tokenizer_model("gpt2")
|
||||
self.gguf_writer.add_tokenizer_pre(tokpre)
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
|
||||
else:
|
||||
# for glm4 series
|
||||
tokpre = self.get_vocab_base_pre(tokenizer)
|
||||
merges = []
|
||||
vocab = {}
|
||||
mergeable_ranks = tokenizer._mergeable_ranks
|
||||
for token, rank in mergeable_ranks.items():
|
||||
vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
|
||||
if len(token) == 1:
|
||||
continue
|
||||
merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
|
||||
assert len(merged) >= 2 and len(merged) <= 7
|
||||
merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
|
||||
|
||||
merges = []
|
||||
vocab = {}
|
||||
mergeable_ranks = tokenizer.mergeable_ranks
|
||||
for token, rank in mergeable_ranks.items():
|
||||
vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
|
||||
if len(token) == 1:
|
||||
continue
|
||||
merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
|
||||
assert len(merged) >= 2 and len(merged) <= 7
|
||||
merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
|
||||
# for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
|
||||
added_vocab = tokenizer.get_added_vocab()
|
||||
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
|
||||
|
||||
# for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
|
||||
added_vocab = tokenizer.get_added_vocab()
|
||||
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i not in reverse_vocab:
|
||||
tokens.append(f"[PAD{i}]")
|
||||
toktypes.append(gguf.TokenType.UNUSED)
|
||||
elif reverse_vocab[i] in added_vocab:
|
||||
tokens.append(reverse_vocab[i])
|
||||
if tokenizer.added_tokens_decoder[i].special:
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
for i in range(vocab_size):
|
||||
if i not in reverse_vocab:
|
||||
tokens.append(f"[PAD{i}]")
|
||||
toktypes.append(gguf.TokenType.UNUSED)
|
||||
elif reverse_vocab[i] in added_vocab:
|
||||
tokens.append(reverse_vocab[i])
|
||||
if tokenizer.added_tokens_decoder[i].special:
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
else:
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
else:
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("gpt2")
|
||||
self.gguf_writer.add_tokenizer_pre(tokpre)
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
self.gguf_writer.add_tokenizer_model("gpt2")
|
||||
self.gguf_writer.add_tokenizer_pre(tokpre)
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
|
||||
special_vocab.merges = merges
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
|
||||
special_vocab.merges = merges
|
||||
# only add special tokens when they were not already loaded from config.json
|
||||
special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
|
||||
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
|
||||
@ -4437,16 +4446,20 @@ class ChatGLMModel(Model):
|
||||
def set_gguf_parameters(self):
|
||||
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
|
||||
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
|
||||
n_head_kv = self.hparams.get("multi_query_group_num", n_head)
|
||||
n_head_kv = self.hparams.get("multi_query_group_num", self.hparams.get("num_key_value_heads", n_head))
|
||||
self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
|
||||
self.gguf_writer.add_embedding_length(n_embed)
|
||||
self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
|
||||
self.gguf_writer.add_block_count(self.hparams["num_layers"])
|
||||
self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", self.hparams.get("intermediate_size", 4 * n_embed)))
|
||||
self.gguf_writer.add_block_count(self.hparams.get("num_layers", self.hparams["num_hidden_layers"]))
|
||||
self.gguf_writer.add_head_count(n_head)
|
||||
self.gguf_writer.add_head_count_kv(n_head_kv)
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"])
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon",1e-5))
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
self.gguf_writer.add_rope_dimension_count(64)
|
||||
if "attention_dim" in self.hparams:
|
||||
rope_dim = self.hparams["attention_dim"]
|
||||
else:
|
||||
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
|
||||
self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
|
||||
self.gguf_writer.add_add_bos_token(False)
|
||||
rope_freq = 10000
|
||||
if "rope_ratio" in self.hparams:
|
||||
@ -4456,7 +4469,7 @@ class ChatGLMModel(Model):
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
|
||||
if name.endswith(".rotary_pos_emb.inv_freq"):
|
||||
if name.endswith(".rotary_pos_emb.inv_freq") or name.startswith("model.vision."):
|
||||
return []
|
||||
|
||||
name = name.removeprefix("transformer.")
|
||||
|
43
examples/llava/README-glmedge.md
Normal file
43
examples/llava/README-glmedge.md
Normal file
@ -0,0 +1,43 @@
|
||||
# GLMV-EDGE
|
||||
|
||||
Currently this implementation supports [glm-edge-v-2b](https://huggingface.co/THUDM/glm-edge-v-2b) and [glm-edge-v-5b](https://huggingface.co/THUDM/glm-edge-v-5b).
|
||||
|
||||
## Usage
|
||||
Build with cmake or run `make llama-llava-cli` to build it.
|
||||
|
||||
After building, run: `./llama-llava-cli` to see the usage. For example:
|
||||
|
||||
```sh
|
||||
./llama-llava-cli -m model_path/ggml-model-f16.gguf --mmproj model_path/mmproj-model-f16.gguf --image img_path/image.jpg -p "<|system|>\n system prompt <image><|user|>\n prompt <|assistant|>\n"
|
||||
```
|
||||
|
||||
**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
|
||||
**note**: For GPU offloading ensure to use the `-ngl` flag just like usual
|
||||
|
||||
## GGUF conversion
|
||||
|
||||
1. Clone a GLMV-EDGE model ([2B](https://huggingface.co/THUDM/glm-edge-v-2b) or [5B](https://huggingface.co/THUDM/glm-edge-v-5b)). For example:
|
||||
|
||||
```sh
|
||||
git clone https://huggingface.co/THUDM/glm-edge-v-5b or https://huggingface.co/THUDM/glm-edge-v-2b
|
||||
```
|
||||
|
||||
2. Use `glmedge-surgery.py` to split the GLMV-EDGE model to LLM and multimodel projector constituents:
|
||||
|
||||
```sh
|
||||
python ./examples/llava/glmedge-surgery.py -m ../model_path
|
||||
```
|
||||
|
||||
4. Use `glmedge-convert-image-encoder-to-gguf.py` to convert the GLMV-EDGE image encoder to GGUF:
|
||||
|
||||
```sh
|
||||
python ./examples/llava/glmedge-convert-image-encoder-to-gguf.py -m ../model_path --llava-projector ../model_path/glm.projector --output-dir ../model_path
|
||||
```
|
||||
|
||||
5. Use `examples/convert_hf_to_gguf.py` to convert the LLM part of GLMV-EDGE to GGUF:
|
||||
|
||||
```sh
|
||||
python convert_hf_to_gguf.py ../model_path
|
||||
```
|
||||
|
||||
Now both the LLM part and the image encoder are in the `model_path` directory.
|
@ -101,6 +101,7 @@ static std::string format(const char * fmt, ...) {
|
||||
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
|
||||
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
|
||||
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
|
||||
#define KEY_HAS_GLM_PROJ "clip.has_glm_projector"
|
||||
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
|
||||
#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
|
||||
#define KEY_USE_GELU "clip.use_gelu"
|
||||
@ -159,6 +160,15 @@ static std::string format(const char * fmt, ...) {
|
||||
#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
|
||||
#define TN_MINICPMV_LN "resampler.ln_%s.%s"
|
||||
|
||||
#define TN_GLM_ADAPER_CONV "adapter.conv.%s"
|
||||
#define TN_GLM_ADAPTER_LINEAR "adapter.linear.linear.%s"
|
||||
#define TN_GLM_ADAPTER_NORM_1 "adapter.linear.norm1.%s"
|
||||
#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
|
||||
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
|
||||
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
|
||||
#define TN_GLM_BOI_W "adapter.boi"
|
||||
#define TN_GLM_EOI_W "adapter.eoi"
|
||||
|
||||
|
||||
enum projector_type {
|
||||
PROJECTOR_TYPE_MLP,
|
||||
@ -166,6 +176,7 @@ enum projector_type {
|
||||
PROJECTOR_TYPE_LDP,
|
||||
PROJECTOR_TYPE_LDPV2,
|
||||
PROJECTOR_TYPE_RESAMPLER,
|
||||
PROJECTOR_TYPE_ADAPTER,
|
||||
PROJECTOR_TYPE_MERGER,
|
||||
PROJECTOR_TYPE_UNKNOWN,
|
||||
};
|
||||
@ -175,6 +186,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_LDP, "ldp" },
|
||||
{ PROJECTOR_TYPE_LDPV2, "ldpv2"},
|
||||
{ PROJECTOR_TYPE_RESAMPLER, "resampler"},
|
||||
{ PROJECTOR_TYPE_ADAPTER, "adapter"},
|
||||
{ PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
|
||||
};
|
||||
|
||||
@ -499,6 +511,12 @@ struct clip_vision_model {
|
||||
struct ggml_tensor * mm_4_w = NULL;
|
||||
struct ggml_tensor * mm_4_b = NULL;
|
||||
|
||||
//GLMV-Edge projection
|
||||
struct ggml_tensor * mm_model_adapter_conv_w;
|
||||
struct ggml_tensor * mm_model_adapter_conv_b;
|
||||
struct ggml_tensor * boi_w;
|
||||
struct ggml_tensor * eoi_w;
|
||||
|
||||
// MobileVLM projection
|
||||
struct ggml_tensor * mm_model_mlp_1_w;
|
||||
struct ggml_tensor * mm_model_mlp_1_b;
|
||||
@ -559,6 +577,7 @@ struct clip_ctx {
|
||||
bool has_vision_encoder = false;
|
||||
bool has_llava_projector = false;
|
||||
bool has_minicpmv_projector = false;
|
||||
bool has_glm_projector = false;
|
||||
bool has_qwen2vl_merger = false;
|
||||
int minicpmv_version = 2;
|
||||
|
||||
@ -637,7 +656,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
|
||||
const int batch_size = imgs->size;
|
||||
|
||||
if (ctx->has_llava_projector || ctx->has_minicpmv_projector) {
|
||||
if (ctx->has_llava_projector || ctx->has_minicpmv_projector || ctx->has_glm_projector) {
|
||||
GGML_ASSERT(batch_size == 1);
|
||||
}
|
||||
|
||||
@ -730,8 +749,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
}
|
||||
|
||||
// loop over layers
|
||||
if (ctx->has_minicpmv_projector || ctx->has_qwen2vl_merger) {
|
||||
// TODO: figure out why we doing thing in this way ???
|
||||
if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) {
|
||||
n_layer += 1;
|
||||
}
|
||||
for (int il = 0; il < n_layer - 1; il++) {
|
||||
@ -1086,7 +1104,33 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
|
||||
// glm projector
|
||||
else if(ctx->has_glm_projector){
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_ADAPTER){
|
||||
size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
|
||||
embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3));
|
||||
embeddings = ggml_reshape_3d(ctx0,embeddings,gridsz,gridsz,embeddings->ne[1]);
|
||||
embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
|
||||
embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
|
||||
embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3));
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
|
||||
//GLU
|
||||
{
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
|
||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
|
||||
embeddings = ggml_gelu_inplace(ctx0, embeddings);
|
||||
struct ggml_tensor * x = embeddings;
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
|
||||
x = ggml_mul_mat(ctx0,model.mm_model_mlp_1_w,x);
|
||||
embeddings = ggml_silu_inplace(ctx0,embeddings);
|
||||
embeddings = ggml_mul(ctx0,embeddings,x);
|
||||
embeddings = ggml_mul_mat(ctx0,model.mm_model_mlp_3_w,embeddings);
|
||||
}
|
||||
}else{
|
||||
GGML_ABORT("fatel error");
|
||||
}
|
||||
}else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
|
||||
embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
|
||||
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
||||
@ -1275,6 +1319,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
|
||||
}
|
||||
|
||||
idx = gguf_find_key(ctx, KEY_HAS_GLM_PROJ);
|
||||
if (idx != -1) {
|
||||
new_clip->has_glm_projector = gguf_get_val_bool(ctx, idx);
|
||||
}
|
||||
|
||||
idx = gguf_find_key(ctx, KEY_HAS_QWEN2VL_MERGER);
|
||||
if (idx != -1) {
|
||||
new_clip->has_qwen2vl_merger = gguf_get_val_bool(ctx, idx);
|
||||
@ -1299,6 +1348,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
||||
LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
||||
LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
|
||||
LOG_INF("%s: glm_projector: %d\n", __func__, new_clip->has_glm_projector);
|
||||
LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
|
||||
LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
||||
}
|
||||
@ -1566,6 +1616,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
|
||||
vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
|
||||
}
|
||||
else if(new_clip->proj_type == PROJECTOR_TYPE_ADAPTER){
|
||||
vision_model.mm_model_adapter_conv_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPER_CONV, "weight"));
|
||||
vision_model.mm_model_adapter_conv_b = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPER_CONV, "bias"));
|
||||
vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_LINEAR,"weight"));
|
||||
vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_NORM_1,"weight"));
|
||||
vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_NORM_1,"bias"));
|
||||
vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
|
||||
vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_GATE,"weight"));
|
||||
vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
|
||||
vision_model.boi_w = get_tensor(new_clip->ctx_data, TN_GLM_BOI_W);
|
||||
vision_model.eoi_w = get_tensor(new_clip->ctx_data, TN_GLM_EOI_W);
|
||||
}
|
||||
else if (new_clip->proj_type == PROJECTOR_TYPE_MERGER) {
|
||||
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
|
||||
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
|
||||
@ -2098,6 +2160,20 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
||||
return true;
|
||||
}
|
||||
|
||||
if(ctx->has_glm_projector){
|
||||
res_imgs->size = 1;
|
||||
res_imgs->data = new clip_image_f32[res_imgs->size];
|
||||
clip_image_u8 resized_image;
|
||||
int32_t sz=ctx->vision_model.hparams.image_size;
|
||||
bicubic_resize(*img, resized_image,sz,sz);
|
||||
clip_image_f32 * res = clip_image_f32_init();
|
||||
//clip_image_save_to_bmp(resized_image, "resized.bmp");
|
||||
normalize_image_u8_to_f32(&resized_image, res, ctx->image_mean, ctx->image_std);
|
||||
res_imgs->data[0] = *res;
|
||||
clip_image_f32_free(res);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool pad_to_square = true;
|
||||
if (!ctx->has_vision_encoder) {
|
||||
LOG_ERR("This gguf file seems to have no vision encoder\n");
|
||||
@ -2283,6 +2359,8 @@ void clip_free(clip_ctx * ctx) {
|
||||
}
|
||||
|
||||
size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
|
||||
if(ctx->has_glm_projector)
|
||||
return (clip_n_patches(ctx)+2) * clip_n_mmproj_embd(ctx) * sizeof(float);
|
||||
return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
|
||||
}
|
||||
|
||||
@ -2325,7 +2403,7 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
|
||||
|
||||
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
|
||||
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_ADAPTER) {
|
||||
n_patches /= 4;
|
||||
} else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
|
||||
if (ctx->minicpmv_version == 2) {
|
||||
@ -2455,6 +2533,12 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
if (ctx->has_minicpmv_projector) {
|
||||
GGML_ASSERT(batch_size == 1);
|
||||
}
|
||||
if(ctx->has_glm_projector){
|
||||
GGML_ASSERT(batch_size == 1);
|
||||
ggml_tensor * boi = ctx->vision_model.boi_w;
|
||||
ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi));
|
||||
vec=(float*)(vec+ggml_nelements(boi)); //offset for boi
|
||||
}
|
||||
|
||||
// build the inference graph
|
||||
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
|
||||
@ -2604,7 +2688,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
|
||||
free(positions_data);
|
||||
|
||||
{
|
||||
if (!ctx->has_glm_projector){
|
||||
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
|
||||
int* patches_data = (int*)malloc(ggml_nbytes(patches));
|
||||
for (int i = 0; i < num_patches; i++) {
|
||||
@ -2628,6 +2712,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
// copy the embeddings to the location passed by the user
|
||||
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
|
||||
|
||||
if(ctx->has_glm_projector){
|
||||
//eoi
|
||||
ggml_tensor * eoi = ctx->vision_model.eoi_w;
|
||||
int offset=ggml_nelements(eoi)*clip_n_patches(ctx);
|
||||
ggml_backend_tensor_get(eoi,vec+offset,0,ggml_nbytes(eoi));
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -2785,6 +2876,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
return 3584;
|
||||
}
|
||||
}
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_ADAPTER){
|
||||
return ctx->vision_model.mm_model_mlp_3_w->ne[1];
|
||||
}
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
|
||||
return ctx->vision_model.mm_1_b->ne[0];
|
||||
}
|
||||
@ -2800,6 +2894,9 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool clip_is_glm(const struct clip_ctx * ctx) {
|
||||
return ctx->has_glm_projector;
|
||||
}
|
||||
bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
|
||||
return ctx->has_qwen2vl_merger;
|
||||
}
|
||||
|
@ -93,6 +93,8 @@ CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
|
||||
|
||||
CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
|
||||
|
||||
CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
280
examples/llava/glmedge-convert-image-encoder-to-gguf.py
Normal file
280
examples/llava/glmedge-convert-image-encoder-to-gguf.py
Normal file
@ -0,0 +1,280 @@
|
||||
import argparse
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
from gguf import *
|
||||
|
||||
TEXT = "clip.text"
|
||||
VISION = "clip.vision"
|
||||
from transformers import SiglipVisionModel, SiglipVisionConfig
|
||||
|
||||
def k(raw_key: str, arch: str) -> str:
|
||||
return raw_key.format(arch=arch)
|
||||
|
||||
|
||||
def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool:
|
||||
if name in (
|
||||
"logit_scale",
|
||||
"text_model.embeddings.position_ids",
|
||||
"vision_model.embeddings.position_ids",
|
||||
):
|
||||
return True
|
||||
|
||||
if name in (
|
||||
"vision_model.head.probe",
|
||||
"vision_model.head.attention.in_proj_weight",
|
||||
"vision_model.head.attention.in_proj_bias",
|
||||
"vision_model.head.attention.out_proj.weight",
|
||||
"vision_model.head.attention.out_proj.bias",
|
||||
"vision_model.head.layernorm.weight",
|
||||
"vision_model.head.layernorm.bias",
|
||||
"vision_model.head.mlp.fc1.weight",
|
||||
"vision_model.head.mlp.fc1.bias",
|
||||
"vision_model.head.mlp.fc2.weight",
|
||||
"vision_model.head.mlp.fc2.bias"
|
||||
):
|
||||
return True
|
||||
|
||||
if name.startswith("v") and not has_vision:
|
||||
return True
|
||||
|
||||
if name.startswith("t") and not has_text:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def get_tensor_name(name: str) -> str:
|
||||
if "projection" in name:
|
||||
return name
|
||||
if "mm_projector" in name:
|
||||
name = name.replace("model.mm_projector", "mm")
|
||||
name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
|
||||
name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
|
||||
return name
|
||||
|
||||
return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
|
||||
|
||||
|
||||
def bytes_to_unicode():
|
||||
"""
|
||||
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
||||
The reversible bpe codes work on unicode strings.
|
||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||
This is a significant percentage of your normal, say, 32K bpe vocab.
|
||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||
"""
|
||||
bs = (
|
||||
list(range(ord("!"), ord("~") + 1))
|
||||
+ list(range(ord("¡"), ord("¬") + 1))
|
||||
+ list(range(ord("®"), ord("ÿ") + 1))
|
||||
)
|
||||
cs = bs[:]
|
||||
n = 0
|
||||
for b in range(2**8):
|
||||
if b not in bs:
|
||||
bs.append(b)
|
||||
cs.append(2**8 + n)
|
||||
n += 1
|
||||
cs = [chr(n) for n in cs]
|
||||
return dict(zip(bs, cs))
|
||||
|
||||
|
||||
ap = argparse.ArgumentParser()
|
||||
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
|
||||
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
|
||||
ap.add_argument("--text-only", action="store_true", required=False,
|
||||
help="Save a text-only model. It can't be used to encode images")
|
||||
ap.add_argument("--vision-only", action="store_true", required=False,
|
||||
help="Save a vision-only model. It can't be used to encode texts")
|
||||
ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
|
||||
help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
|
||||
ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
|
||||
help="The clip model is from openclip (for ViT-SO400M type))")
|
||||
ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
|
||||
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2","adapter"], default="adapter")
|
||||
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
|
||||
# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
|
||||
# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
|
||||
default_image_mean = [0.5, 0.5, 0.5]
|
||||
default_image_std = [0.5, 0.5, 0.5]
|
||||
ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
|
||||
ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
|
||||
|
||||
# with proper
|
||||
args = ap.parse_args()
|
||||
|
||||
|
||||
if args.text_only and args.vision_only:
|
||||
print("--text-only and --image-only arguments cannot be specified at the same time.")
|
||||
exit(1)
|
||||
|
||||
if args.use_f32:
|
||||
print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
|
||||
|
||||
# output in the same directory as the model if output_dir is None
|
||||
dir_model = args.model_dir
|
||||
|
||||
if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
|
||||
vocab = None
|
||||
tokens = None
|
||||
else:
|
||||
with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
|
||||
vocab = json.load(f)
|
||||
tokens = [key for key in vocab]
|
||||
|
||||
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
|
||||
config = json.load(f)
|
||||
if args.clip_model_is_vision:
|
||||
v_hparams = config
|
||||
t_hparams = None
|
||||
else:
|
||||
v_hparams = config["vision_config"]
|
||||
t_hparams = None
|
||||
|
||||
# possible data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
#
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
ftype = 1
|
||||
if args.use_f32:
|
||||
ftype = 0
|
||||
|
||||
vision_config = SiglipVisionConfig(**v_hparams)
|
||||
model = SiglipVisionModel(vision_config)
|
||||
model.load_state_dict(torch.load(os.path.join(dir_model, "glm.clip")))
|
||||
|
||||
fname_middle = None
|
||||
has_text_encoder = False
|
||||
has_vision_encoder = True
|
||||
has_glm_projector = True
|
||||
if args.text_only:
|
||||
fname_middle = "text-"
|
||||
has_vision_encoder = False
|
||||
elif args.llava_projector is not None:
|
||||
fname_middle = "mmproj-"
|
||||
has_text_encoder = False
|
||||
has_glm_projector = True
|
||||
elif args.vision_only:
|
||||
fname_middle = "vision-"
|
||||
has_text_encoder = False
|
||||
else:
|
||||
fname_middle = ""
|
||||
|
||||
output_dir = args.output_dir if args.output_dir is not None else dir_model
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
output_prefix = os.path.basename(output_dir).replace("ggml_", "")
|
||||
fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
|
||||
fout = GGUFWriter(path=fname_out, arch="clip")
|
||||
|
||||
fout.add_bool("clip.has_text_encoder", has_text_encoder)
|
||||
fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
|
||||
fout.add_bool("clip.has_glm_projector", has_glm_projector)
|
||||
fout.add_file_type(ftype)
|
||||
model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model)
|
||||
fout.add_name(model_name)
|
||||
if has_glm_projector:
|
||||
fout.add_description("image encoder for glm4v")
|
||||
fout.add_string("clip.projector_type", "adapter")
|
||||
else:
|
||||
fout.add_description("two-tower CLIP model")
|
||||
|
||||
if has_text_encoder:
|
||||
assert t_hparams is not None
|
||||
assert tokens is not None
|
||||
# text_model hparams
|
||||
fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
|
||||
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
|
||||
fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"])
|
||||
fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"]))
|
||||
fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"])
|
||||
fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"])
|
||||
fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"])
|
||||
fout.add_token_list(tokens)
|
||||
|
||||
if has_vision_encoder:
|
||||
# vision_model hparams
|
||||
fout.add_uint32("clip.vision.image_size", v_hparams["image_size"])
|
||||
fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"])
|
||||
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"])
|
||||
fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"])
|
||||
fout.add_uint32("clip.vision.projection_dim", 0)
|
||||
fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"])
|
||||
fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
|
||||
fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), v_hparams["num_hidden_layers"])
|
||||
|
||||
image_mean = args.image_mean if args.image_mean is not None else default_image_mean
|
||||
image_std = args.image_std if args.image_std is not None else default_image_std
|
||||
fout.add_array("clip.vision.image_mean", image_mean)
|
||||
fout.add_array("clip.vision.image_std", image_std)
|
||||
|
||||
fout.add_bool("clip.use_gelu", True)
|
||||
|
||||
|
||||
if has_glm_projector:
|
||||
# model.vision_model.encoder.layers.pop(-1) # pyright: ignore[reportAttributeAccessIssue]
|
||||
projector = torch.load(args.llava_projector)
|
||||
for name, data in projector.items():
|
||||
name = get_tensor_name(name)
|
||||
# pw and dw conv ndim==4
|
||||
if data.ndim == 2 or data.ndim == 4:
|
||||
data = data.squeeze().numpy().astype(np.float16)
|
||||
else:
|
||||
data = data.squeeze().numpy().astype(np.float32)
|
||||
if name.startswith("vision."):
|
||||
name=name.replace("vision.","")
|
||||
fout.add_tensor(name, data)
|
||||
print(f"Projector {name} - {data.dtype} - shape = {data.shape}")
|
||||
# print(f"Projector {name} tensors added\n")
|
||||
|
||||
state_dict = model.state_dict() # pyright: ignore[reportAttributeAccessIssue]
|
||||
for name, data in state_dict.items():
|
||||
if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_glm_projector):
|
||||
# we don't need this
|
||||
print(f"skipping parameter: {name}")
|
||||
continue
|
||||
|
||||
name = get_tensor_name(name)
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
|
||||
# ftype == 0 -> float32, ftype == 1 -> float16
|
||||
ftype_cur = 0
|
||||
if n_dims == 4:
|
||||
print(f"tensor {name} is always saved in f16")
|
||||
data = data.astype(np.float16)
|
||||
ftype_cur = 1
|
||||
elif ftype == 1:
|
||||
if name[-7:] == ".weight" and n_dims == 2:
|
||||
# print(" Converting to float16")
|
||||
data = data.astype(np.float16)
|
||||
ftype_cur = 1
|
||||
else:
|
||||
# print(" Converting to float32")
|
||||
data = data.astype(np.float32)
|
||||
ftype_cur = 0
|
||||
else:
|
||||
if data.dtype != np.float32:
|
||||
# print(" Converting to float32")
|
||||
data = data.astype(np.float32)
|
||||
ftype_cur = 0
|
||||
print(f"siglip {name} - {data.dtype} - shape = {data.shape}")
|
||||
# print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
|
||||
fout.add_tensor(name, data)
|
||||
|
||||
|
||||
fout.write_header_to_file()
|
||||
fout.write_kv_data_to_file()
|
||||
fout.write_tensors_to_file()
|
||||
fout.close()
|
||||
|
||||
print("Done. Output file: " + fname_out)
|
33
examples/llava/glmedge-surgery.py
Normal file
33
examples/llava/glmedge-surgery.py
Normal file
@ -0,0 +1,33 @@
|
||||
import argparse
|
||||
import os
|
||||
import torch
|
||||
from transformers import AutoModel
|
||||
|
||||
ap = argparse.ArgumentParser()
|
||||
ap.add_argument("-m", "--model", help="Path to GLM model")
|
||||
args = ap.parse_args()
|
||||
|
||||
# find the model part that includes the the multimodal projector weights
|
||||
model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True)
|
||||
checkpoint = model.state_dict()
|
||||
|
||||
# get a list of mm tensor names
|
||||
mm_tensors = [k for k, v in checkpoint.items() if k.startswith("vision.adapter.")]
|
||||
|
||||
# store these tensors in a new dictionary and torch.save them
|
||||
projector = {name: checkpoint[name].float() for name in mm_tensors}
|
||||
torch.save(projector, f"{args.model}/glm.projector")
|
||||
|
||||
clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vision.vit.model.vision_model.")]
|
||||
if len(clip_tensors) > 0:
|
||||
clip = {name.replace("vision.vit.model.", ""): checkpoint[name].float() for name in clip_tensors}
|
||||
torch.save(clip, f"{args.model}/glm.clip")
|
||||
|
||||
# added tokens should be removed to be able to convert Mistral models
|
||||
if os.path.exists(f"{args.model}/added_tokens.json"):
|
||||
with open(f"{args.model}/added_tokens.json", "w") as f:
|
||||
f.write("{}\n")
|
||||
|
||||
print("Done!")
|
||||
print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
|
||||
print(f"Also, use {args.model}glm.projector to prepare a glm-encoder.gguf file.")
|
@ -314,6 +314,20 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||
clip_add_load_image_size(ctx_clip, load_image_size);
|
||||
LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
||||
}
|
||||
else if (clip_is_glm(ctx_clip)){
|
||||
struct clip_image_size * load_image_size = clip_image_size_init();
|
||||
load_image_size->width = img_res_v.data[0].nx;
|
||||
load_image_size->height = img_res_v.data[0].ny;
|
||||
clip_add_load_image_size(ctx_clip, load_image_size);
|
||||
|
||||
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);
|
||||
int pos = int(load_image_size->width/clip_patch_size(ctx_clip)/2);
|
||||
*n_img_pos = (pos * pos + 2);
|
||||
if (!encoded){
|
||||
LOG_ERR("Unable to encode image \n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
|
||||
// flat / default llava-1.5 type embedding
|
||||
*n_img_pos = clip_n_patches(ctx_clip);
|
||||
@ -398,6 +412,9 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
|
||||
if (clip_is_minicpmv(ctx_clip)) {
|
||||
num_max_patches = 10;
|
||||
}
|
||||
if (clip_is_glm(ctx_clip)) {
|
||||
num_max_patches = 1;
|
||||
}
|
||||
float * image_embd;
|
||||
if (clip_is_qwen2vl(ctx_clip)) {
|
||||
// qwen2vl don't split image into chunks, so `num_max_patches` is not needed.
|
||||
|
@ -34,6 +34,7 @@ endforeach()
|
||||
add_executable(${TARGET} ${TARGET_SRCS})
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
|
||||
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
|
||||
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
if (LLAMA_SERVER_SSL)
|
||||
|
@ -450,6 +450,8 @@ These words will not be included in the completion, so make sure to add them to
|
||||
|
||||
`post_sampling_probs`: Returns the probabilities of top `n_probs` tokens after applying sampling chain.
|
||||
|
||||
`response_fields`: A list of response fields, for example: `"response_fields": ["content", "generation_settings/n_predict"]`. If the specified field is missing, it will simply be omitted from the response without triggering an error.
|
||||
|
||||
**Response format**
|
||||
|
||||
- Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.
|
||||
|
@ -92,6 +92,7 @@ struct slot_params {
|
||||
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
|
||||
|
||||
std::vector<std::string> antiprompt;
|
||||
std::vector<std::string> response_fields;
|
||||
bool timings_per_token = false;
|
||||
bool post_sampling_probs = false;
|
||||
bool ignore_eos = false;
|
||||
@ -209,6 +210,7 @@ struct server_task {
|
||||
params.n_discard = json_value(data, "n_discard", defaults.n_discard);
|
||||
//params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement
|
||||
params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms);
|
||||
params.response_fields = json_value(data, "response_fields", std::vector<std::string>());
|
||||
|
||||
params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
|
||||
params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
|
||||
@ -522,6 +524,7 @@ struct server_task_result_cmpl_final : server_task_result {
|
||||
|
||||
bool post_sampling_probs;
|
||||
std::vector<completion_token_output> probs_output;
|
||||
std::vector<std::string> response_fields;
|
||||
|
||||
slot_params generation_params;
|
||||
|
||||
@ -568,7 +571,7 @@ struct server_task_result_cmpl_final : server_task_result {
|
||||
if (!stream && !probs_output.empty()) {
|
||||
res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs);
|
||||
}
|
||||
return res;
|
||||
return response_fields.empty() ? res : json_get_nested_values(response_fields, res);
|
||||
}
|
||||
|
||||
json to_json_oaicompat_chat() {
|
||||
@ -2066,6 +2069,7 @@ struct server_context {
|
||||
res->tokens = slot.generated_tokens;
|
||||
res->timings = slot.get_timings();
|
||||
res->prompt = common_detokenize(ctx, slot.prompt_tokens, true);
|
||||
res->response_fields = slot.params.response_fields;
|
||||
|
||||
res->truncated = slot.truncated;
|
||||
res->n_decoded = slot.n_decoded;
|
||||
@ -3786,6 +3790,17 @@ int main(int argc, char ** argv) {
|
||||
return;
|
||||
}
|
||||
|
||||
bool use_base64 = false;
|
||||
if (body.count("encoding_format") != 0) {
|
||||
const std::string& format = body.at("encoding_format");
|
||||
if (format == "base64") {
|
||||
use_base64 = true;
|
||||
} else if (format != "float") {
|
||||
res_error(res, format_error_response("The format to return the embeddings in. Can be either float or base64", ERROR_TYPE_INVALID_REQUEST));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true);
|
||||
for (const auto & tokens : tokenized_prompts) {
|
||||
// this check is necessary for models that do not add BOS token to the input
|
||||
@ -3837,7 +3852,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// write JSON response
|
||||
json root = oaicompat ? format_embeddings_response_oaicompat(body, responses) : json(responses);
|
||||
json root = oaicompat ? format_embeddings_response_oaicompat(body, responses, use_base64) : json(responses);
|
||||
res_ok(res, root);
|
||||
};
|
||||
|
||||
|
@ -95,7 +95,7 @@ def test_consistent_result_same_seed(n_slots: int):
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": "I believe the meaning of life is",
|
||||
"seed": 42,
|
||||
"temperature": 1.0,
|
||||
"temperature": 0.0,
|
||||
"cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed
|
||||
})
|
||||
if last_res is not None:
|
||||
@ -120,9 +120,10 @@ def test_different_result_different_seed(n_slots: int):
|
||||
assert res.body["content"] != last_res.body["content"]
|
||||
last_res = res
|
||||
|
||||
|
||||
# TODO figure why it don't work with temperature = 1
|
||||
# @pytest.mark.parametrize("temperature", [0.0, 1.0])
|
||||
@pytest.mark.parametrize("n_batch", [16, 32])
|
||||
@pytest.mark.parametrize("temperature", [0.0, 1.0])
|
||||
@pytest.mark.parametrize("temperature", [0.0])
|
||||
def test_consistent_result_different_batch_size(n_batch: int, temperature: float):
|
||||
global server
|
||||
server.n_batch = n_batch
|
||||
@ -257,6 +258,40 @@ def test_completion_parallel_slots(n_slots: int, n_requests: int):
|
||||
# assert match_regex(re_content, res.body["content"])
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"prompt,n_predict,response_fields",
|
||||
[
|
||||
("I believe the meaning of life is", 8, []),
|
||||
("I believe the meaning of life is", 32, ["content", "generation_settings/n_predict", "prompt"]),
|
||||
],
|
||||
)
|
||||
def test_completion_response_fields(
|
||||
prompt: str, n_predict: int, response_fields: list[str]
|
||||
):
|
||||
global server
|
||||
server.start()
|
||||
res = server.make_request(
|
||||
"POST",
|
||||
"/completion",
|
||||
data={
|
||||
"n_predict": n_predict,
|
||||
"prompt": prompt,
|
||||
"response_fields": response_fields,
|
||||
},
|
||||
)
|
||||
assert res.status_code == 200
|
||||
assert "content" in res.body
|
||||
assert len(res.body["content"])
|
||||
if len(response_fields):
|
||||
assert res.body["generation_settings/n_predict"] == n_predict
|
||||
assert res.body["prompt"] == "<s> " + prompt
|
||||
assert isinstance(res.body["content"], str)
|
||||
assert len(res.body) == len(response_fields)
|
||||
else:
|
||||
assert len(res.body)
|
||||
assert "generation_settings" in res.body
|
||||
|
||||
|
||||
def test_n_probs():
|
||||
global server
|
||||
server.start()
|
||||
|
@ -1,3 +1,5 @@
|
||||
import base64
|
||||
import struct
|
||||
import pytest
|
||||
from openai import OpenAI
|
||||
from utils import *
|
||||
@ -194,3 +196,42 @@ def test_embedding_usage_multiple():
|
||||
assert res.status_code == 200
|
||||
assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']
|
||||
assert res.body['usage']['prompt_tokens'] == 2 * 9
|
||||
|
||||
|
||||
def test_embedding_openai_library_base64():
|
||||
server.start()
|
||||
test_input = "Test base64 embedding output"
|
||||
|
||||
# get embedding in default format
|
||||
res = server.make_request("POST", "/v1/embeddings", data={
|
||||
"input": test_input
|
||||
})
|
||||
assert res.status_code == 200
|
||||
vec0 = res.body["data"][0]["embedding"]
|
||||
|
||||
# get embedding in base64 format
|
||||
res = server.make_request("POST", "/v1/embeddings", data={
|
||||
"input": test_input,
|
||||
"encoding_format": "base64"
|
||||
})
|
||||
|
||||
assert res.status_code == 200
|
||||
assert "data" in res.body
|
||||
assert len(res.body["data"]) == 1
|
||||
|
||||
embedding_data = res.body["data"][0]
|
||||
assert "embedding" in embedding_data
|
||||
assert isinstance(embedding_data["embedding"], str)
|
||||
|
||||
# Verify embedding is valid base64
|
||||
decoded = base64.b64decode(embedding_data["embedding"])
|
||||
# Verify decoded data can be converted back to float array
|
||||
float_count = len(decoded) // 4 # 4 bytes per float
|
||||
floats = struct.unpack(f'{float_count}f', decoded)
|
||||
assert len(floats) > 0
|
||||
assert all(isinstance(x, float) for x in floats)
|
||||
assert len(floats) == len(vec0)
|
||||
|
||||
# make sure the decoded data is the same as the original
|
||||
for x, y in zip(floats, vec0):
|
||||
assert abs(x - y) < EPSILON
|
||||
|
@ -3,6 +3,7 @@
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
#include "common/base64.hpp"
|
||||
|
||||
#ifndef NDEBUG
|
||||
// crash the server in debug mode, otherwise send an http 500 error
|
||||
@ -90,6 +91,28 @@ static bool json_is_array_of_mixed_numbers_strings(const json & data) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// get value by path(key1 / key2)
|
||||
static json json_get_nested_values(const std::vector<std::string> & paths, const json & js) {
|
||||
json result = json::object();
|
||||
|
||||
for (const std::string & path : paths) {
|
||||
json current = js;
|
||||
const auto keys = string_split<std::string>(path, /*separator*/ '/');
|
||||
bool valid_path = true;
|
||||
for (const std::string & k : keys) {
|
||||
if (valid_path && current.is_object() && current.contains(k)) {
|
||||
current = current[k];
|
||||
} else {
|
||||
valid_path = false;
|
||||
}
|
||||
}
|
||||
if (valid_path) {
|
||||
result[path] = current;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* this handles 2 cases:
|
||||
* - only string, example: "string"
|
||||
@ -591,16 +614,31 @@ static json oaicompat_completion_params_parse(
|
||||
return llama_params;
|
||||
}
|
||||
|
||||
static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
|
||||
static json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false) {
|
||||
json data = json::array();
|
||||
int32_t n_tokens = 0;
|
||||
int i = 0;
|
||||
for (const auto & elem : embeddings) {
|
||||
data.push_back(json{
|
||||
{"embedding", json_value(elem, "embedding", json::array())},
|
||||
{"index", i++},
|
||||
{"object", "embedding"}
|
||||
});
|
||||
json embedding_obj;
|
||||
|
||||
if (use_base64) {
|
||||
const auto& vec = json_value(elem, "embedding", json::array()).get<std::vector<float>>();
|
||||
const char* data_ptr = reinterpret_cast<const char*>(vec.data());
|
||||
size_t data_size = vec.size() * sizeof(float);
|
||||
embedding_obj = {
|
||||
{"embedding", base64::encode(data_ptr, data_size)},
|
||||
{"index", i++},
|
||||
{"object", "embedding"},
|
||||
{"encoding_format", "base64"}
|
||||
};
|
||||
} else {
|
||||
embedding_obj = {
|
||||
{"embedding", json_value(elem, "embedding", json::array())},
|
||||
{"index", i++},
|
||||
{"object", "embedding"}
|
||||
};
|
||||
}
|
||||
data.push_back(embedding_obj);
|
||||
|
||||
n_tokens += json_value(elem, "tokens_evaluated", 0);
|
||||
}
|
||||
|
@ -234,6 +234,7 @@ function(ggml_add_backend_library backend)
|
||||
# write the shared library to the output directory
|
||||
set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
||||
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
|
||||
add_dependencies(ggml ${backend})
|
||||
else()
|
||||
add_library(${backend} ${ARGN})
|
||||
target_link_libraries(ggml PUBLIC ${backend})
|
||||
|
@ -66,6 +66,26 @@
|
||||
#include "ggml-kompute.h"
|
||||
#endif
|
||||
|
||||
// disable C++17 deprecation warning for std::codecvt_utf8
|
||||
#if defined(__clang__)
|
||||
# pragma clang diagnostic push
|
||||
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
|
||||
#endif
|
||||
|
||||
static std::wstring utf8_to_utf16(const std::string & str) {
|
||||
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
|
||||
return converter.from_bytes(str);
|
||||
}
|
||||
|
||||
static std::string utf16_to_utf8(const std::wstring & str) {
|
||||
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
|
||||
return converter.to_bytes(str);
|
||||
}
|
||||
|
||||
#if defined(__clang__)
|
||||
# pragma clang diagnostic pop
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
using dl_handle = std::remove_pointer_t<HMODULE>;
|
||||
@ -88,11 +108,6 @@ static dl_handle * dl_load_library(const std::wstring & path) {
|
||||
return handle;
|
||||
}
|
||||
|
||||
static dl_handle * dl_load_library(const std::string & path) {
|
||||
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
|
||||
return dl_load_library(converter.from_bytes(path));
|
||||
}
|
||||
|
||||
static void * dl_get_sym(dl_handle * handle, const char * name) {
|
||||
DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
|
||||
SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
|
||||
@ -114,8 +129,8 @@ struct dl_handle_deleter {
|
||||
}
|
||||
};
|
||||
|
||||
static void * dl_load_library(const std::string & path) {
|
||||
dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
|
||||
static void * dl_load_library(const std::wstring & path) {
|
||||
dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
|
||||
|
||||
return handle;
|
||||
}
|
||||
@ -202,11 +217,11 @@ struct ggml_backend_registry {
|
||||
devices.push_back(device);
|
||||
}
|
||||
|
||||
ggml_backend_reg_t load_backend(const char * path, bool silent) {
|
||||
ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
|
||||
dl_handle_ptr handle { dl_load_library(path) };
|
||||
if (!handle) {
|
||||
if (!silent) {
|
||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
|
||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
@ -214,7 +229,7 @@ struct ggml_backend_registry {
|
||||
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
|
||||
if (score_fn && score_fn() == 0) {
|
||||
if (!silent) {
|
||||
GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
|
||||
GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
@ -222,7 +237,7 @@ struct ggml_backend_registry {
|
||||
auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
|
||||
if (!backend_init_fn) {
|
||||
if (!silent) {
|
||||
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path);
|
||||
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
@ -231,16 +246,16 @@ struct ggml_backend_registry {
|
||||
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
|
||||
if (!silent) {
|
||||
if (!reg) {
|
||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
|
||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
|
||||
} else {
|
||||
GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
|
||||
__func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
|
||||
__func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
|
||||
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
|
||||
|
||||
register_backend(reg, std::move(handle));
|
||||
|
||||
@ -376,14 +391,14 @@ ggml_backend_t ggml_backend_init_best(void) {
|
||||
|
||||
// Dynamic loading
|
||||
ggml_backend_reg_t ggml_backend_load(const char * path) {
|
||||
return get_reg().load_backend(path, false);
|
||||
return get_reg().load_backend(utf8_to_utf16(path), false);
|
||||
}
|
||||
|
||||
void ggml_backend_unload(ggml_backend_reg_t reg) {
|
||||
get_reg().unload_backend(reg, true);
|
||||
}
|
||||
|
||||
static std::string get_executable_path() {
|
||||
static std::wstring get_executable_path() {
|
||||
#if defined(__APPLE__)
|
||||
// get executable path
|
||||
std::vector<char> path;
|
||||
@ -401,7 +416,7 @@ static std::string get_executable_path() {
|
||||
if (last_slash != std::string::npos) {
|
||||
base_path = base_path.substr(0, last_slash);
|
||||
}
|
||||
return base_path + "/";
|
||||
return utf8_to_utf16(base_path + "/");
|
||||
#elif defined(__linux__) || defined(__FreeBSD__)
|
||||
std::string base_path = ".";
|
||||
std::vector<char> path(1024);
|
||||
@ -427,57 +442,63 @@ static std::string get_executable_path() {
|
||||
path.resize(path.size() * 2);
|
||||
}
|
||||
|
||||
return base_path + "/";
|
||||
return utf8_to_utf16(base_path + "/");
|
||||
#elif defined(_WIN32)
|
||||
std::vector<char> path(MAX_PATH);
|
||||
DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
|
||||
std::vector<wchar_t> path(MAX_PATH);
|
||||
DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
|
||||
if (len == 0) {
|
||||
return "";
|
||||
return {};
|
||||
}
|
||||
std::string base_path(path.data(), len);
|
||||
std::wstring base_path(path.data(), len);
|
||||
// remove executable name
|
||||
auto last_slash = base_path.find_last_of('\\');
|
||||
if (last_slash != std::string::npos) {
|
||||
base_path = base_path.substr(0, last_slash);
|
||||
}
|
||||
return base_path + "\\";
|
||||
return base_path + L"\\";
|
||||
#else
|
||||
return {};
|
||||
#endif
|
||||
}
|
||||
|
||||
static std::string backend_filename_prefix() {
|
||||
static std::wstring backend_filename_prefix() {
|
||||
#ifdef _WIN32
|
||||
return "ggml-";
|
||||
return L"ggml-";
|
||||
#else
|
||||
return "libggml-";
|
||||
return L"libggml-";
|
||||
#endif
|
||||
}
|
||||
|
||||
static std::string backend_filename_suffix() {
|
||||
static std::wstring backend_filename_suffix() {
|
||||
#ifdef _WIN32
|
||||
return ".dll";
|
||||
return L".dll";
|
||||
#else
|
||||
return ".so";
|
||||
return L".so";
|
||||
#endif
|
||||
}
|
||||
|
||||
static std::wstring path_separator() {
|
||||
#ifdef _WIN32
|
||||
return L"\\";
|
||||
#else
|
||||
return L"/";
|
||||
#endif
|
||||
}
|
||||
|
||||
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
|
||||
// enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
|
||||
// TODO: search system paths
|
||||
std::string file_prefix = backend_filename_prefix() + name + "-";
|
||||
std::vector<std::string> search_paths;
|
||||
std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
|
||||
std::vector<std::wstring> search_paths;
|
||||
if (user_search_path == nullptr) {
|
||||
search_paths.push_back("./");
|
||||
search_paths.push_back(L"." + path_separator());
|
||||
search_paths.push_back(get_executable_path());
|
||||
} else {
|
||||
#if defined(_WIN32)
|
||||
search_paths.push_back(std::string(user_search_path) + "\\");
|
||||
#else
|
||||
search_paths.push_back(std::string(user_search_path) + "/");
|
||||
#endif
|
||||
search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
|
||||
}
|
||||
|
||||
int best_score = 0;
|
||||
std::string best_path;
|
||||
std::wstring best_path;
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
for (const auto & search_path : search_paths) {
|
||||
@ -487,27 +508,27 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
|
||||
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
|
||||
for (const auto & entry : dir_it) {
|
||||
if (entry.is_regular_file()) {
|
||||
std::string filename = entry.path().filename().string();
|
||||
std::string ext = entry.path().extension().string();
|
||||
std::wstring filename = entry.path().filename().wstring();
|
||||
std::wstring ext = entry.path().extension().wstring();
|
||||
if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
|
||||
dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
|
||||
dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
|
||||
if (!handle && !silent) {
|
||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
|
||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
|
||||
}
|
||||
if (handle) {
|
||||
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
|
||||
if (score_fn) {
|
||||
int s = score_fn();
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
|
||||
GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
|
||||
#endif
|
||||
if (s > best_score) {
|
||||
best_score = s;
|
||||
best_path = entry.path().string();
|
||||
best_path = entry.path().wstring();
|
||||
}
|
||||
} else {
|
||||
if (!silent) {
|
||||
GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
|
||||
GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -519,15 +540,15 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
|
||||
if (best_score == 0) {
|
||||
// try to load the base backend
|
||||
for (const auto & search_path : search_paths) {
|
||||
std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix();
|
||||
std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
|
||||
if (fs::exists(path)) {
|
||||
return get_reg().load_backend(path.c_str(), silent);
|
||||
return get_reg().load_backend(path, silent);
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return get_reg().load_backend(best_path.c_str(), silent);
|
||||
return get_reg().load_backend(best_path, silent);
|
||||
}
|
||||
|
||||
void ggml_backend_load_all() {
|
||||
|
@ -135,14 +135,20 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
endif()
|
||||
|
||||
# show enabled features
|
||||
if (CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows")
|
||||
set(FEAT_INPUT_FILE "NUL")
|
||||
else()
|
||||
set(FEAT_INPUT_FILE "/dev/null")
|
||||
endif()
|
||||
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E -
|
||||
INPUT_FILE "/dev/null"
|
||||
INPUT_FILE ${FEAT_INPUT_FILE}
|
||||
OUTPUT_VARIABLE ARM_FEATURE
|
||||
RESULT_VARIABLE ARM_FEATURE_RESULT
|
||||
)
|
||||
if (ARM_FEATURE_RESULT)
|
||||
message(FATAL_ERROR "Failed to get ARM features")
|
||||
message(WARNING "Failed to get ARM features")
|
||||
else()
|
||||
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC)
|
||||
string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
|
||||
@ -317,6 +323,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
|
||||
|
||||
if (GGML_BACKEND_DL)
|
||||
if (GGML_NATIVE)
|
||||
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
|
||||
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
|
||||
endif()
|
||||
|
||||
# The feature detection code is compiled as a separate target so that
|
||||
# it can be built without the architecture flags
|
||||
# Since multiple variants of the CPU backend may be included in the same
|
||||
|
@ -7419,14 +7419,14 @@ static void ggml_compute_forward_mul_mat(
|
||||
if (src1_cont) {
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++)
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++)
|
||||
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
|
||||
if (!llamafile_sgemm(params,
|
||||
ne01, ne11, ne00/ggml_blck_size(src0->type),
|
||||
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
|
||||
nb01/ggml_type_size(src0->type),
|
||||
(const char *)src1->data + i12*nb12 + i13*nb13,
|
||||
nb11/ggml_type_size(src1->type),
|
||||
(char *)dst->data + i12*nb2 + i13*nb3,
|
||||
nb1/ggml_type_size(dst->type),
|
||||
ith, nth,
|
||||
src0->type,
|
||||
src1->type,
|
||||
dst->type))
|
||||
@ -7471,14 +7471,14 @@ UseGgmlGemm1:;
|
||||
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++)
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++)
|
||||
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
|
||||
if (!llamafile_sgemm(params,
|
||||
ne01, ne11, ne00/ggml_blck_size(src0->type),
|
||||
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
|
||||
nb01/ggml_type_size(src0->type),
|
||||
(const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
|
||||
row_size/ggml_type_size(vec_dot_type),
|
||||
(char *)dst->data + i12*nb2 + i13*nb3,
|
||||
nb1/ggml_type_size(dst->type),
|
||||
ith, nth,
|
||||
src0->type,
|
||||
vec_dot_type,
|
||||
dst->type))
|
||||
|
@ -53,6 +53,8 @@
|
||||
#include "ggml-cpu-impl.h"
|
||||
#include "ggml-quants.h"
|
||||
|
||||
#include <atomic>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define NOINLINE __declspec(noinline)
|
||||
#else
|
||||
@ -134,6 +136,16 @@ inline __m512 madd(__m512 a, __m512 b, __m512 c) {
|
||||
return _mm512_fmadd_ps(a, b, c);
|
||||
}
|
||||
#endif
|
||||
#if defined(__AVX512BF16__)
|
||||
template <>
|
||||
inline __m512 madd(__m512bh a, __m512bh b, __m512 c) {
|
||||
return _mm512_dpbf16_ps(c, a, b);
|
||||
}
|
||||
template <>
|
||||
inline __m256 madd(__m256bh a, __m256bh b, __m256 c) {
|
||||
return _mm256_dpbf16_ps(c, a, b);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_FEATURE_FMA)
|
||||
@ -226,6 +238,13 @@ template <> inline __m256 load(const float *p) {
|
||||
}
|
||||
#endif // __AVX__
|
||||
|
||||
#if defined(__AVX2__) || defined(__AVX512F__)
|
||||
template <> inline __m256 load(const ggml_bf16_t *p) {
|
||||
return _mm256_castsi256_ps(
|
||||
_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)p)), 16));
|
||||
}
|
||||
#endif // __AVX2__
|
||||
|
||||
#if defined(__F16C__)
|
||||
template <> inline __m256 load(const ggml_fp16_t *p) {
|
||||
return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)p));
|
||||
@ -239,8 +258,27 @@ template <> inline __m512 load(const float *p) {
|
||||
template <> inline __m512 load(const ggml_fp16_t *p) {
|
||||
return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)p));
|
||||
}
|
||||
template <> inline __m512 load(const ggml_bf16_t *p) {
|
||||
return _mm512_castsi512_ps(
|
||||
_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)p)), 16));
|
||||
}
|
||||
#endif // __AVX512F__
|
||||
|
||||
#if defined(__AVX512BF16__)
|
||||
template <> inline __m512bh load(const ggml_bf16_t *p) {
|
||||
return (__m512bh)_mm512_loadu_ps((const float *)p);
|
||||
}
|
||||
template <> inline __m256bh load(const ggml_bf16_t *p) {
|
||||
return (__m256bh)_mm256_loadu_ps((const float *)p);
|
||||
}
|
||||
template <> inline __m512bh load(const float *p) {
|
||||
return _mm512_cvtne2ps_pbh(_mm512_loadu_ps(p + 16), _mm512_loadu_ps(p));
|
||||
}
|
||||
template <> inline __m256bh load(const float *p) {
|
||||
return _mm512_cvtneps_pbh(_mm512_loadu_ps(p));
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// CONSTANTS
|
||||
|
||||
@ -252,199 +290,170 @@ static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// FLOATING POINT MATRIX MULTIPLICATION
|
||||
|
||||
template <int M>
|
||||
static inline int64_t BLOCK_SIZE(size_t m) {
|
||||
const int64_t NB_BLOC_M = (m + M - 1) / M;
|
||||
return (m % NB_BLOC_M == 0) ? m / NB_BLOC_M : (m / NB_BLOC_M) + 1;
|
||||
}
|
||||
|
||||
static constexpr inline int64_t BLOC_POS(int64_t ib, int64_t ibN, int64_t bloc_size) {
|
||||
return ib < ibN ? ib * bloc_size : ibN * bloc_size + (ib - ibN) * (bloc_size - 1);
|
||||
}
|
||||
|
||||
template <int KN, typename D, typename V, typename TA, typename TB, typename TC>
|
||||
class tinyBLAS {
|
||||
public:
|
||||
tinyBLAS(int64_t k,
|
||||
tinyBLAS(const ggml_compute_params * params, int64_t k,
|
||||
const TA *A, int64_t lda,
|
||||
const TB *B, int64_t ldb,
|
||||
TC *C, int64_t ldc,
|
||||
int ith, int nth)
|
||||
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
|
||||
TC *C, int64_t ldc)
|
||||
: params(params), A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc) {
|
||||
}
|
||||
|
||||
void matmul(int64_t m, int64_t n) {
|
||||
mnpack(0, m, 0, n);
|
||||
bool matmul(int64_t m, int64_t n) {
|
||||
if (k % KN != 0)
|
||||
return false;
|
||||
// compute RM for only need tile with size RM&RM-1
|
||||
#if VECTOR_REGISTERS == 32
|
||||
if (m % 16 == 0 && (m/16 >= params->nth)) {
|
||||
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
|
||||
mnpack<4, 6, 4>(m, n, SIZE_N, 12);
|
||||
return true;
|
||||
}
|
||||
if (m % 8 == 0 ) {
|
||||
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
|
||||
mnpack<4, 6, 2>(m, n, SIZE_N, 12);
|
||||
return true;
|
||||
}
|
||||
if (m % 4 == 0) {
|
||||
const int64_t SIZE_N = BLOCK_SIZE<6>(n);
|
||||
mnpack<4, 6, 1>(m, n, SIZE_N, 12);
|
||||
return true;
|
||||
}
|
||||
#else // VECTOR_REGISTERS == 16
|
||||
if (m % 16 == 0 && (m/16 >= params->nth)) {
|
||||
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
|
||||
mnpack<4, 3, 4>(m, n, SIZE_N, 24);
|
||||
return true;
|
||||
}
|
||||
if (m % 8 == 0 ) {
|
||||
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
|
||||
mnpack<4, 3, 2>(m, n, SIZE_N, 24);
|
||||
return true;
|
||||
}
|
||||
if (m % 4 == 0) {
|
||||
const int64_t SIZE_N = BLOCK_SIZE<3>(n);
|
||||
mnpack<4, 3, 1>(m, n, SIZE_N, 24);
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
private:
|
||||
NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int64_t mc, nc, mp, np;
|
||||
switch ((MIN(m - m0, 5) << 4) | MIN(n - n0, 5)) {
|
||||
#if VECTOR_REGISTERS == 32
|
||||
case 0x55:
|
||||
mc = 5;
|
||||
nc = 5;
|
||||
gemm<5, 5>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x45:
|
||||
mc = 4;
|
||||
nc = 5;
|
||||
gemm<4, 5>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x54:
|
||||
mc = 5;
|
||||
nc = 4;
|
||||
gemm<5, 4>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x44:
|
||||
mc = 4;
|
||||
nc = 4;
|
||||
gemm<4, 4>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x53:
|
||||
mc = 5;
|
||||
nc = 3;
|
||||
gemm<5, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x35:
|
||||
mc = 3;
|
||||
nc = 5;
|
||||
gemm<3, 5>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x43:
|
||||
mc = 4;
|
||||
nc = 3;
|
||||
gemm<4, 3>(m0, m, n0, n);
|
||||
break;
|
||||
#else
|
||||
case 0x55:
|
||||
case 0x54:
|
||||
case 0x53:
|
||||
case 0x45:
|
||||
case 0x44:
|
||||
case 0x43:
|
||||
mc = 4;
|
||||
nc = 3;
|
||||
gemm<4, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x35:
|
||||
#endif
|
||||
case 0x34:
|
||||
mc = 3;
|
||||
nc = 4;
|
||||
gemm<3, 4>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x52:
|
||||
mc = 5;
|
||||
nc = 2;
|
||||
gemm<5, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x33:
|
||||
mc = 3;
|
||||
nc = 3;
|
||||
gemm<3, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x25:
|
||||
mc = 2;
|
||||
nc = 5;
|
||||
gemm<2, 5>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x42:
|
||||
mc = 4;
|
||||
nc = 2;
|
||||
gemm<4, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x24:
|
||||
mc = 2;
|
||||
nc = 4;
|
||||
gemm<2, 4>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x32:
|
||||
mc = 3;
|
||||
nc = 2;
|
||||
gemm<3, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x23:
|
||||
mc = 2;
|
||||
nc = 3;
|
||||
gemm<2, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x51:
|
||||
mc = 5;
|
||||
nc = 1;
|
||||
gemm<5, 1>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x41:
|
||||
mc = 4;
|
||||
nc = 1;
|
||||
gemm<4, 1>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x22:
|
||||
mc = 2;
|
||||
nc = 2;
|
||||
gemm<2, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x15:
|
||||
mc = 1;
|
||||
nc = 5;
|
||||
gemm<1, 5>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x14:
|
||||
mc = 1;
|
||||
nc = 4;
|
||||
gemm<1, 4>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x31:
|
||||
mc = 3;
|
||||
nc = 1;
|
||||
gemm<3, 1>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x13:
|
||||
mc = 1;
|
||||
nc = 3;
|
||||
gemm<1, 3>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x21:
|
||||
mc = 2;
|
||||
nc = 1;
|
||||
gemm<2, 1>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x12:
|
||||
mc = 1;
|
||||
nc = 2;
|
||||
gemm<1, 2>(m0, m, n0, n);
|
||||
break;
|
||||
case 0x11:
|
||||
mc = 1;
|
||||
nc = 1;
|
||||
gemm<1, 1>(m0, m, n0, n);
|
||||
break;
|
||||
default:
|
||||
return;
|
||||
template <int RM, int RN, int BM>
|
||||
inline void mnpack(int64_t m, int64_t n, int64_t SIZE_N, int64_t BN) {
|
||||
if (SIZE_N == RN) {
|
||||
return gemm<RM, RN, BM>(m, n, BN);
|
||||
}
|
||||
if constexpr (RN > 1) {
|
||||
return mnpack<RM, RN-1, BM>(m, n, SIZE_N, BN);
|
||||
} else {
|
||||
GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N);
|
||||
GGML_ASSERT(false); // we have miss something.
|
||||
}
|
||||
mp = m0 + (m - m0) / mc * mc;
|
||||
np = n0 + (n - n0) / nc * nc;
|
||||
mnpack(mp, m, n0, np);
|
||||
mnpack(m0, m, np, n);
|
||||
}
|
||||
|
||||
template <int RM, int RN>
|
||||
NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
|
||||
int64_t ytiles = (m - m0) / RM;
|
||||
int64_t xtiles = (n - n0) / RN;
|
||||
int64_t tiles = xtiles * ytiles;
|
||||
int64_t duty = (tiles + nth - 1) / nth;
|
||||
int64_t start = duty * ith;
|
||||
int64_t end = start + duty;
|
||||
if (end > tiles)
|
||||
end = tiles;
|
||||
for (int64_t job = start; job < end; ++job) {
|
||||
int64_t ii = m0 + job / xtiles * RM;
|
||||
int64_t jj = n0 + job % xtiles * RN;
|
||||
D Cv[RN][RM] = {};
|
||||
for (int64_t l = 0; l < k; l += KN)
|
||||
for (int64_t j = 0; j < RN; ++j)
|
||||
for (int64_t i = 0; i < RM; ++i)
|
||||
Cv[j][i] = madd(load<V>(A + lda * (ii + i) + l),
|
||||
load<V>(B + ldb * (jj + j) + l),
|
||||
Cv[j][i]);
|
||||
for (int64_t j = 0; j < RN; ++j)
|
||||
for (int64_t i = 0; i < RM; ++i)
|
||||
C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
|
||||
inline void gemm_bloc(int64_t ii, int64_t jj) {
|
||||
D Cv[RN][RM] = {};
|
||||
for (int64_t l = 0; l < k; l += KN) {
|
||||
// help compiler for op order.
|
||||
if constexpr (RM <= RN) {
|
||||
V Av[RM];
|
||||
for (int64_t i = 0; i < RM; ++i) {
|
||||
Av[i] = load<V>(A + lda * (ii + i) + l);
|
||||
}
|
||||
for (int64_t j = 0; j < RN; ++j) {
|
||||
V Bv = load<V>(B + ldb * (jj + j) + l);
|
||||
for (int64_t i = 0; i < RM; ++i) {
|
||||
Cv[j][i] = madd(Av[i], Bv, Cv[j][i]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
V Bv[RN];
|
||||
for (int64_t j = 0; j < RN; ++j) {
|
||||
Bv[j] = load<V>(B + ldb * (jj + j) + l);
|
||||
}
|
||||
for (int64_t i = 0; i < RM; ++i) {
|
||||
V Av = load<V>(A + lda * (ii + i) + l);
|
||||
for (int64_t j = 0; j < RN; ++j) {
|
||||
Cv[j][i] = madd(Av, Bv[j], Cv[j][i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int64_t j = 0; j < RN; ++j)
|
||||
for (int64_t i = 0; i < RM; ++i)
|
||||
C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
|
||||
}
|
||||
|
||||
template <int RM, int RN, int BM>
|
||||
NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
|
||||
static std::atomic<int64_t> current_chunk;
|
||||
|
||||
GGML_ASSERT(m % (RM * BM) == 0);
|
||||
const int64_t ytiles = m / (RM * BM);
|
||||
const int64_t xtiles = (n + RN -1) / RN;
|
||||
const int64_t jj_RN = (xtiles - (xtiles * RN - n));
|
||||
|
||||
// "round" bloc_size to "nearest" BN
|
||||
const int64_t NB_BN = xtiles < BN ? 1 : (xtiles + BN / 2) / BN;
|
||||
const int64_t SIZE_BN = xtiles % NB_BN == 0 ? xtiles / NB_BN : xtiles / NB_BN + 1;
|
||||
const int64_t jj_BN = (NB_BN - (NB_BN * SIZE_BN - xtiles));
|
||||
const int64_t nb_job = ytiles * NB_BN;
|
||||
|
||||
if (params->ith == 0) {
|
||||
GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
|
||||
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
|
||||
std::atomic_store_explicit(¤t_chunk, (int64_t)params->nth, std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
|
||||
int64_t job = params->ith;
|
||||
while (job < nb_job) {
|
||||
const int64_t ii = (job % ytiles) * RM * BM;
|
||||
const int64_t jb = job / ytiles;
|
||||
const int64_t jr0 = BLOC_POS(jb , jj_BN, SIZE_BN);
|
||||
const int64_t jrN = BLOC_POS(jb+1, jj_BN, SIZE_BN);
|
||||
|
||||
const int64_t jj0 = BLOC_POS(jr0, jj_RN, RN);
|
||||
const int64_t jj2 = BLOC_POS(jrN, jj_RN, RN);
|
||||
const int64_t jj1 = jj2 < jj_RN * RN ? jj2 : jj_RN * RN;
|
||||
|
||||
for (int64_t bi = 0; bi < BM * RM; bi += RM) {
|
||||
int64_t jj = jj0;
|
||||
for (; jj < jj1; jj += RN) {
|
||||
gemm_bloc<RM, RN>(ii + bi, jj);
|
||||
}
|
||||
if constexpr (RN > 1) {
|
||||
for (; jj < jj2; jj += RN - 1) {
|
||||
gemm_bloc<RM, RN-1>(ii + bi, jj);
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(jj == jj2);
|
||||
}
|
||||
|
||||
// next step.
|
||||
job = std::atomic_fetch_add_explicit(¤t_chunk, (int64_t)1, std::memory_order_relaxed);
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
return;
|
||||
}
|
||||
|
||||
const ggml_compute_params * params;
|
||||
const TA *const A;
|
||||
const TB *const B;
|
||||
TC *const C;
|
||||
@ -452,8 +461,6 @@ class tinyBLAS {
|
||||
const int64_t lda;
|
||||
const int64_t ldb;
|
||||
const int64_t ldc;
|
||||
const int ith;
|
||||
const int nth;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////////
|
||||
@ -1657,8 +1664,9 @@ class tinyBLAS_PPC {
|
||||
* @param Ctype is GGML data type of `C`
|
||||
* @return true if this function was able to service the matmul request
|
||||
*/
|
||||
bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
|
||||
int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) {
|
||||
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
|
||||
const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
|
||||
int64_t ldc, int Atype, int Btype, int Ctype) {
|
||||
|
||||
assert(m >= 0);
|
||||
assert(n >= 0);
|
||||
@ -1666,8 +1674,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
assert(lda >= k);
|
||||
assert(ldb >= k);
|
||||
assert(ldc >= m);
|
||||
assert(nth > 0);
|
||||
assert(ith < nth);
|
||||
assert(params->nth > 0);
|
||||
assert(params->ith < params->nth);
|
||||
|
||||
// only enable sgemm for prompt processing
|
||||
if (n < 2)
|
||||
@ -1682,37 +1690,25 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
if (Btype != GGML_TYPE_F32)
|
||||
return false;
|
||||
#if defined(__AVX512F__)
|
||||
if (k % 16)
|
||||
return false;
|
||||
tinyBLAS<16, __m512, __m512, float, float, float> tb{
|
||||
tinyBLAS<16, __m512, __m512, float, float, float> tb{ params,
|
||||
k, (const float *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
(float *)C, ldc};
|
||||
return tb.matmul(m, n);
|
||||
#elif defined(__AVX__) || defined(__AVX2__)
|
||||
if (k % 8)
|
||||
return false;
|
||||
tinyBLAS<8, __m256, __m256, float, float, float> tb{
|
||||
tinyBLAS<8, __m256, __m256, float, float, float> tb{ params,
|
||||
k, (const float *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
(float *)C, ldc};
|
||||
return tb.matmul(m, n);
|
||||
#elif defined(__ARM_NEON)
|
||||
if (n < 4)
|
||||
return false;
|
||||
if (k % 4)
|
||||
return false;
|
||||
tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{
|
||||
tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ params,
|
||||
k, (const float *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
(float *)C, ldc};
|
||||
return tb.matmul(m, n);
|
||||
#elif defined(__MMA__)
|
||||
if (k % 8)
|
||||
return false;
|
||||
@ -1720,7 +1716,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
k, (const float *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
params->ith, params->nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#else
|
||||
@ -1728,60 +1724,71 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
#endif
|
||||
}
|
||||
|
||||
case GGML_TYPE_BF16: {
|
||||
#if defined(__AVX512BF16__)
|
||||
if (Btype == GGML_TYPE_BF16) {
|
||||
tinyBLAS<32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
|
||||
(const ggml_bf16_t *)A, lda,
|
||||
(const ggml_bf16_t *)B, ldb,
|
||||
(float *)C, ldc};
|
||||
return tb.matmul(m, n);
|
||||
}
|
||||
#elif defined(__AVX512F__)
|
||||
if (Btype == GGML_TYPE_BF16) {
|
||||
tinyBLAS<16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
|
||||
(const ggml_bf16_t *)A, lda,
|
||||
(const ggml_bf16_t *)B, ldb,
|
||||
(float *)C, ldc};
|
||||
return tb.matmul(m, n);
|
||||
}
|
||||
#elif defined(__AVX2__)
|
||||
if (Btype == GGML_TYPE_BF16) {
|
||||
tinyBLAS<8, __m256, __m256, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k,
|
||||
(const ggml_bf16_t *)A, lda,
|
||||
(const ggml_bf16_t *)B, ldb,
|
||||
(float *)C, ldc};
|
||||
return tb.matmul(m, n);
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
case GGML_TYPE_F16: {
|
||||
#if defined(__AVX512F__)
|
||||
if (k % 16)
|
||||
return false;
|
||||
if (Btype != GGML_TYPE_F32)
|
||||
return false;
|
||||
tinyBLAS<16, __m512, __m512, ggml_fp16_t, float, float> tb{
|
||||
k, (const ggml_fp16_t *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
if (Btype == GGML_TYPE_F16) {
|
||||
tinyBLAS<16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, float> tb{ params, k,
|
||||
(const ggml_fp16_t *)A, lda,
|
||||
(const ggml_fp16_t *)B, ldb,
|
||||
(float *)C, ldc};
|
||||
return tb.matmul(m, n);
|
||||
}
|
||||
#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
|
||||
if (k % 8)
|
||||
return false;
|
||||
if (Btype != GGML_TYPE_F32)
|
||||
return false;
|
||||
tinyBLAS<8, __m256, __m256, ggml_fp16_t, float, float> tb{
|
||||
k, (const ggml_fp16_t *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
if (Btype == GGML_TYPE_F16) {
|
||||
tinyBLAS<8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, float> tb{ params, k,
|
||||
(const ggml_fp16_t *)A, lda,
|
||||
(const ggml_fp16_t *)B, ldb,
|
||||
(float *)C, ldc};
|
||||
return tb.matmul(m, n);
|
||||
}
|
||||
#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
|
||||
if (n < 8)
|
||||
return false;
|
||||
if (k % 8)
|
||||
return false;
|
||||
if (Btype != GGML_TYPE_F16)
|
||||
return false;
|
||||
tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{
|
||||
k, (const ggml_fp16_t *)A, lda,
|
||||
(const ggml_fp16_t *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
if (Btype == GGML_TYPE_F16) {
|
||||
tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params,
|
||||
k, (const ggml_fp16_t *)A, lda,
|
||||
(const ggml_fp16_t *)B, ldb,
|
||||
(float *)C, ldc};
|
||||
return tb.matmul(m, n);
|
||||
}
|
||||
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
|
||||
if (k % 4)
|
||||
return false;
|
||||
if (Btype != GGML_TYPE_F32)
|
||||
return false;
|
||||
tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{
|
||||
k, (const ggml_fp16_t *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
if (Btype == GGML_TYPE_F32) {
|
||||
tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{ params,
|
||||
k, (const ggml_fp16_t *)A, lda,
|
||||
(const float *)B, ldb,
|
||||
(float *)C, ldc};
|
||||
return tb.matmul(m, n);
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
case GGML_TYPE_Q8_0: {
|
||||
@ -1792,7 +1799,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
k, (const block_q8_0 *)A, lda,
|
||||
(const block_q8_0 *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
params->ith, params->nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#elif defined(__ARM_FEATURE_DOTPROD)
|
||||
@ -1800,7 +1807,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
k, (const block_q8_0 *)A, lda,
|
||||
(const block_q8_0 *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
params->ith, params->nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#else
|
||||
@ -1816,7 +1823,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
k, (const block_q4_0 *)A, lda,
|
||||
(const block_q8_0 *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
params->ith, params->nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#elif defined(__ARM_FEATURE_DOTPROD)
|
||||
@ -1824,7 +1831,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
k, (const block_q4_0 *)A, lda,
|
||||
(const block_q8_0 *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
params->ith, params->nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#else
|
||||
@ -1840,7 +1847,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
k, (const block_q5_0 *)A, lda,
|
||||
(const block_q8_0 *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
params->ith, params->nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#else
|
||||
@ -1856,7 +1863,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
k, (const block_iq4_nl *)A, lda,
|
||||
(const block_q8_0 *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
ith, nth};
|
||||
params->ith, params->nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#else
|
||||
@ -1868,6 +1875,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
return false;
|
||||
}
|
||||
|
||||
(void)params;
|
||||
(void)m;
|
||||
(void)n;
|
||||
(void)k;
|
||||
@ -1877,8 +1885,6 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||
(void)ldb;
|
||||
(void)C;
|
||||
(void)ldc;
|
||||
(void)ith;
|
||||
(void)nth;
|
||||
(void)Atype;
|
||||
(void)Btype;
|
||||
(void)Ctype;
|
||||
|
@ -5,8 +5,8 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t,
|
||||
const void *, int64_t, void *, int64_t, int, int,
|
||||
bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t,
|
||||
const void *, int64_t, const void *, int64_t, void *, int64_t,
|
||||
int, int, int);
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
@ -1284,6 +1284,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_QKV,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
|
@ -126,6 +126,8 @@ connection = sqlite3.connect(input_file)
|
||||
cursor = connection.cursor()
|
||||
builds = cursor.execute("SELECT DISTINCT build_commit FROM test;").fetchall()
|
||||
|
||||
commit_short_len = len(builds[0][0])
|
||||
|
||||
try:
|
||||
repo = git.Repo(".", search_parent_directories=True)
|
||||
except git.InvalidGitRepositoryError:
|
||||
@ -138,11 +140,11 @@ def find_parent_in_data(commit: git.Commit):
|
||||
seen_hexsha8 = set()
|
||||
while heap:
|
||||
depth, current_commit = heapq.heappop(heap)
|
||||
current_hexsha8 = commit.hexsha[:8]
|
||||
current_hexsha8 = commit.hexsha[:commit_short_len]
|
||||
if (current_hexsha8,) in builds:
|
||||
return current_hexsha8
|
||||
for parent in commit.parents:
|
||||
parent_hexsha8 = parent.hexsha[:8]
|
||||
parent_hexsha8 = parent.hexsha[:commit_short_len]
|
||||
if parent_hexsha8 not in seen_hexsha8:
|
||||
seen_hexsha8.add(parent_hexsha8)
|
||||
heapq.heappush(heap, (depth + 1, parent))
|
||||
@ -156,9 +158,9 @@ def get_all_parent_hexsha8s(commit: git.Commit):
|
||||
|
||||
while unvisited:
|
||||
current_commit = unvisited.pop(0)
|
||||
visited.append(current_commit.hexsha[:8])
|
||||
visited.append(current_commit.hexsha[:commit_short_len])
|
||||
for parent in current_commit.parents:
|
||||
if parent.hexsha[:8] not in visited:
|
||||
if parent.hexsha[:commit_short_len] not in visited:
|
||||
unvisited.append(parent)
|
||||
|
||||
return visited
|
||||
@ -169,10 +171,10 @@ def get_commit_name(hexsha8):
|
||||
if repo is None:
|
||||
return hexsha8
|
||||
for h in repo.heads:
|
||||
if h.commit.hexsha[:8] == hexsha8:
|
||||
if h.commit.hexsha[:commit_short_len] == hexsha8:
|
||||
return h.name
|
||||
for t in repo.tags:
|
||||
if t.commit.hexsha[:8] == hexsha8:
|
||||
if t.commit.hexsha[:commit_short_len] == hexsha8:
|
||||
return t.name
|
||||
return hexsha8
|
||||
|
||||
@ -183,13 +185,13 @@ def get_commit_hexsha8(name):
|
||||
return None
|
||||
for h in repo.heads:
|
||||
if h.name == name:
|
||||
return h.commit.hexsha[:8]
|
||||
return h.commit.hexsha[:commit_short_len]
|
||||
for t in repo.tags:
|
||||
if t.name == name:
|
||||
return t.commit.hexsha[:8]
|
||||
return t.commit.hexsha[:commit_short_len]
|
||||
for c in repo.iter_commits("--all"):
|
||||
if c.hexsha[:8] == name[:8]:
|
||||
return c.hexsha[:8]
|
||||
if c.hexsha[:commit_short_len] == name[:commit_short_len]:
|
||||
return c.hexsha[:commit_short_len]
|
||||
return None
|
||||
|
||||
|
||||
|
@ -26,7 +26,7 @@ function has_cmd {
|
||||
}
|
||||
|
||||
if has_cmd wget; then
|
||||
cmd="wget -q --show-progress -c -O %s/%s %s"
|
||||
cmd="wget -q -c -O %s/%s %s"
|
||||
elif has_cmd curl; then
|
||||
cmd="curl -C - -f --output-dir %s -o %s -L %s"
|
||||
else
|
||||
|
@ -1657,7 +1657,7 @@ bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token t
|
||||
}
|
||||
|
||||
llama_token llama_token_bos_impl(const struct llama_vocab & vocab) {
|
||||
return vocab.special_bos_id;
|
||||
return vocab.type != LLAMA_VOCAB_TYPE_WPM ? vocab.special_bos_id : vocab.special_cls_id;
|
||||
}
|
||||
|
||||
llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {
|
||||
|
@ -45,7 +45,7 @@ struct llama_vocab {
|
||||
id special_unk_id = 0;
|
||||
id special_sep_id = LLAMA_TOKEN_NULL;
|
||||
id special_pad_id = LLAMA_TOKEN_NULL;
|
||||
id special_cls_id = LLAMA_TOKEN_NULL;
|
||||
id special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
|
||||
id special_mask_id = LLAMA_TOKEN_NULL;
|
||||
|
||||
id linefeed_id = 13;
|
||||
|
@ -1440,6 +1440,9 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
@ -1715,6 +1718,7 @@ enum llm_chat_template {
|
||||
LLM_CHAT_TEMPLATE_LLAMA_3,
|
||||
LLM_CHAT_TEMPLATE_CHATGML_3,
|
||||
LLM_CHAT_TEMPLATE_CHATGML_4,
|
||||
LLM_CHAT_TEMPLATE_GLMEDGE,
|
||||
LLM_CHAT_TEMPLATE_MINICPM,
|
||||
LLM_CHAT_TEMPLATE_EXAONE_3,
|
||||
LLM_CHAT_TEMPLATE_RWKV_WORLD,
|
||||
@ -1749,6 +1753,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
||||
{ "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
|
||||
{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
|
||||
{ "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
|
||||
{ "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
|
||||
{ "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
|
||||
{ "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
|
||||
{ "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
|
||||
@ -6300,8 +6305,20 @@ static void llm_load_hparams(
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
switch (hparams.n_layer) {
|
||||
case 28: model.type = e_model::MODEL_6B; break;
|
||||
case 40: model.type = e_model::MODEL_9B; break;
|
||||
case 28: {
|
||||
if(hparams.n_head(0)==16){
|
||||
model.type = e_model::MODEL_1_6B;
|
||||
}else{
|
||||
model.type = e_model::MODEL_6B;
|
||||
}
|
||||
}break;
|
||||
case 40:{
|
||||
if(hparams.n_head(0)==24){
|
||||
model.type = e_model::MODEL_4B;
|
||||
}else{
|
||||
model.type = e_model::MODEL_9B;
|
||||
}
|
||||
} break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
@ -9434,9 +9451,14 @@ static bool llm_load_tensors(
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
||||
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
|
||||
if(model.type == e_model::MODEL_1_6B || model.type == e_model::MODEL_4B){
|
||||
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
||||
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
||||
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
||||
}else{
|
||||
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
||||
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
}
|
||||
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
|
||||
@ -16827,20 +16849,28 @@ struct llm_build_context {
|
||||
struct ggml_tensor * Qcur = nullptr;
|
||||
struct ggml_tensor * Kcur = nullptr;
|
||||
struct ggml_tensor * Vcur = nullptr;
|
||||
if(model.type == e_model::MODEL_1_6B || model.type == e_model::MODEL_4B){
|
||||
Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
}else{
|
||||
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
|
||||
cb(cur, "wqkv", il);
|
||||
if(model.layers[il].bqkv){
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
||||
cb(cur, "bqkv", il);
|
||||
}
|
||||
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
||||
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
||||
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
}
|
||||
|
||||
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
|
||||
cb(cur, "wqkv", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
||||
cb(cur, "bqkv", il);
|
||||
|
||||
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
||||
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
||||
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
//printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
||||
@ -22921,6 +22951,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
|
||||
return LLM_CHAT_TEMPLATE_CHATGML_3;
|
||||
} else if (tmpl_contains("[gMASK]<sop>")) {
|
||||
return LLM_CHAT_TEMPLATE_CHATGML_4;
|
||||
} else if (tmpl_contains("<|user|>") && tmpl_contains("<|assistant|>") && !tmpl_contains("<|end|>") && !tmpl_contains("</s>")) {
|
||||
return LLM_CHAT_TEMPLATE_GLMEDGE;
|
||||
} else if (tmpl_contains(LU8("<用户>"))) {
|
||||
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
|
||||
return LLM_CHAT_TEMPLATE_MINICPM;
|
||||
@ -23204,6 +23236,14 @@ static int32_t llama_chat_apply_template_internal(
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>";
|
||||
}
|
||||
} else if(tmpl == LLM_CHAT_TEMPLATE_GLMEDGE){
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
ss << "<|" << role << "|>" << "\n" << message->content;
|
||||
}
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
|
||||
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
|
||||
for (auto message : chat) {
|
||||
|
@ -61,6 +61,8 @@ int main(void) {
|
||||
"{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|>\n {{ message['content'] }}{% else %}<|{{ message['role'] }}|>\n {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}",
|
||||
// ChatGLM4
|
||||
u8"[gMASK]<sop>{% for item in messages %}{% if item['tools'] is defined %}<|system|>\n你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具{% set tools = item['tools'] %}{% for tool in tools %}{% if tool['type'] == 'function' %}\n\n## {{ tool['function']['name'] }}\n\n{{ tool['function'] | tojson(indent=4) }}\n......{% endif %}{% endfor %}{% endif %}{% if item['content'] %}<|{{ item['role'] }}|>{{ item['metadata'] }}\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}",
|
||||
// GLM-edge
|
||||
"{% for item in messages %}{% if item['role'] == 'system' %}<|system|>\n{{ item['content'] }}{% elif item['role'] == 'user' %}<|user|>\n{{ item['content'] }}{% elif item['role'] == 'assistant' %}<|assistant|>\n{{ item['content'] }}{% endif %}{% endfor %}<|assistant|>\n",
|
||||
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
|
||||
u8"{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + '<AI>'}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}",
|
||||
// DeepSeek-V2
|
||||
@ -119,6 +121,8 @@ int main(void) {
|
||||
"[gMASK]sop<|system|>\n You are a helpful assistant<|user|>\n Hello<|assistant|>\n Hi there<|user|>\n Who are you<|assistant|>\n I am an assistant <|user|>\n Another question<|assistant|>",
|
||||
// ChatGLM4
|
||||
"[gMASK]<sop><|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant <|user|>\nAnother question<|assistant|>",
|
||||
// GLM-Edge
|
||||
"<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant <|user|>\nAnother question<|assistant|>",
|
||||
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
|
||||
u8"You are a helpful assistant<用户>Hello<AI>Hi there<用户>Who are you<AI>I am an assistant<用户>Another question<AI>",
|
||||
// DeepSeek-V2
|
||||
|
Loading…
Reference in New Issue
Block a user