diff --git a/README.md b/README.md index bb2ca9dbd..cc667f592 100644 --- a/README.md +++ b/README.md @@ -138,6 +138,7 @@ Typically finetunes of the base models below are supported as well. - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM) - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL) - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM) +- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2) **HTTP server** diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index f568f470c..e3c9bcd43 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -104,6 +104,7 @@ static std::string format(const char * fmt, ...) { #define TN_POS_EMBD "%s.position_embd.weight" #define TN_CLASS_EMBD "v.class_embd" #define TN_PATCH_EMBD "v.patch_embd.weight" +#define TN_PATCH_BIAS "v.patch_embd.bias" #define TN_ATTN_K "%s.blk.%d.attn_k.%s" #define TN_ATTN_Q "%s.blk.%d.attn_q.%s" #define TN_ATTN_V "%s.blk.%d.attn_v.%s" @@ -425,6 +426,7 @@ struct clip_vision_model { // embeddings struct ggml_tensor * class_embedding; struct ggml_tensor * patch_embeddings; + struct ggml_tensor * patch_bias; struct ggml_tensor * position_embeddings; struct ggml_tensor * pre_ln_w; @@ -501,6 +503,11 @@ struct clip_ctx { bool use_gelu = false; int32_t ftype = 1; + bool has_class_embedding = true; + bool has_pre_norm = true; + bool has_post_norm = false; + bool has_patch_bias = false; + struct gguf_context * ctx_gguf; struct ggml_context * ctx_data; @@ -526,7 +533,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 const int patch_size = hparams.patch_size; const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side); - const int num_positions = num_patches + 1; + const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0); const int hidden_size = hparams.hidden_size; const int n_head = hparams.n_head; const int d_head = hidden_size / n_head; @@ -557,16 +564,23 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); + if (ctx->has_patch_bias) { + // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp)); + inp = ggml_add(ctx0, inp, model.patch_bias); + } + // concat class_embeddings and patch_embeddings - struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); + struct ggml_tensor * embeddings = inp; + if (ctx->has_class_embedding) { + embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); + embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); + embeddings = ggml_acc(ctx0, embeddings, inp, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); + } ggml_set_name(embeddings, "embeddings"); ggml_set_input(embeddings); - embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - - embeddings = ggml_acc(ctx0, embeddings, inp, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); ggml_set_name(positions, "positions"); @@ -576,7 +590,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); // pre-layernorm - { + if (ctx->has_pre_norm) { embeddings = ggml_norm(ctx0, embeddings, eps); ggml_set_name(embeddings, "pre_ln"); @@ -664,6 +678,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 embeddings = cur; } + // post-layernorm + if (ctx->has_post_norm) { + embeddings = ggml_norm(ctx0, embeddings, eps); + ggml_set_name(embeddings, "post_ln"); + + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b); + } + // llava projector { embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); @@ -1148,12 +1170,39 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } + try { + vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD); + new_clip->has_class_embedding = true; + } catch (const std::exception& e) { + new_clip->has_class_embedding = false; + } + + try { + vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); + vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); + new_clip->has_pre_norm = true; + } catch (std::exception & e) { + new_clip->has_pre_norm = false; + } + + try { + vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight")); + vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias")); + new_clip->has_post_norm = true; + } catch (std::exception & e) { + new_clip->has_post_norm = false; + } + + try { + vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS); + new_clip->has_patch_bias = true; + } catch (std::exception & e) { + new_clip->has_patch_bias = false; + } + try { vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); - vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); - vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); - vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); } catch(const std::exception& e) { LOG_TEE("%s: failed to load vision model tensors\n", __func__); }