diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 4a2ca6cc3..391bc8ea5 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -186,7 +186,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_LDP,       "ldp" },
     { PROJECTOR_TYPE_LDPV2,     "ldpv2"},
     { PROJECTOR_TYPE_RESAMPLER, "resampler"},
-    { PROJECTOR_TYPE_ADAPTER,   "adapter"}
+    { PROJECTOR_TYPE_ADAPTER,   "adapter"},
     { PROJECTOR_TYPE_MERGER,    "qwen2vl_merger"},
 };
@@ -1130,7 +1130,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         }else{
             GGML_ABORT("fatel error");
         }
-        else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
+    }else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
         embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size);
         embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
@@ -1627,6 +1627,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
         vision_model.boi_w = get_tensor(new_clip->ctx_data, TN_GLM_BOI_W);
         vision_model.eoi_w = get_tensor(new_clip->ctx_data, TN_GLM_EOI_W);
+    } else if (new_clip->proj_type == PROJECTOR_TYPE_MERGER) {
         vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
         vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 624154fc4..2edc0cc68 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -415,7 +415,6 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
     if (clip_is_glm(ctx_clip)) {
         num_max_patches = 1;
     }
-    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
     float * image_embd;
     if (clip_is_qwen2vl(ctx_clip)) {
         // qwen2vl don't split image into chunks, so `num_max_patches` is not needed.