put all code into llava dir

caitianchi 2024-06-04 15:10:00 +08:00
parent a95a6d995d
commit efe4c61717
20 changed files with 728 additions and 2586 deletions

View File

@@ -862,12 +862,12 @@ llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/cli
 	$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)

-minicpmv-cli: examples/minicpmv/minicpmv-cli.cpp examples/minicpmv/clip.h examples/minicpmv/clip.cpp examples/minicpmv/minicpmv.h examples/minicpmv/minicpmv.cpp examples/minicpmv/minicpmv_wrapper.h examples/minicpmv/minicpmv_wrapper.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+minicpmv-cli: examples/llava/minicpmv-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp examples/llava/minicpmv_wrapper.h examples/llava/minicpmv_wrapper.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) -c examples/minicpmv/clip.cpp -o $(call GET_OBJ_FILE, examples/minicpmv/clip.cpp) -Wno-cast-qual
-	$(CXX) $(CXXFLAGS) -c examples/minicpmv/minicpmv.cpp -o $(call GET_OBJ_FILE, examples/minicpmv/minicpmv.cpp)
-	$(CXX) $(CXXFLAGS) -c examples/minicpmv/minicpmv_wrapper.cpp -o $(call GET_OBJ_FILE, examples/minicpmv/minicpmv_wrapper.cpp)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/minicpmv/clip.cpp examples/minicpmv/minicpmv.cpp examples/minicpmv/minicpmv_wrapper.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/minicpmv/clip.cpp) $(call GET_OBJ_FILE, examples/minicpmv/minicpmv.cpp) $(call GET_OBJ_FILE, examples/minicpmv/minicpmv_wrapper.cpp) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
+	$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
+	$(CXX) $(CXXFLAGS) -c examples/llava/minicpmv_wrapper.cpp -o $(call GET_OBJ_FILE, examples/llava/minicpmv_wrapper.cpp)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp examples/llava/minicpmv_wrapper.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) $(call GET_OBJ_FILE, examples/llava/minicpmv_wrapper.cpp) -o $@ $(LDFLAGS)

 baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)

View File

@@ -35,3 +35,8 @@ add_executable(llava-cli llava-cli.cpp)
 install(TARGETS llava-cli RUNTIME)
 target_link_libraries(llava-cli PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(llava PRIVATE cxx_std_11)
+
+add_library(minicpmv_wrapper OBJECT
+            minicpmv_wrapper.cpp
+           )
+target_link_libraries(minicpmv_wrapper PRIVATE llava ${CMAKE_THREAD_LIBS_INIT})

View File

(binary image file changed; 304 KiB before and after)

View File

@@ -3,6 +3,7 @@
 // I'll gradually clean and extend it
 // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
+#include "common.h"
 #include "log.h"
 #include "ggml.h"
 #include "ggml-alloc.h"

@@ -70,26 +71,27 @@ static std::string format(const char * fmt, ...) {
 // key constants
 //

 #define KEY_FTYPE "general.file_type"
 #define KEY_NAME "general.name"
 #define KEY_DESCRIPTION "general.description"
 #define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
 #define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
 #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
+#define KEY_HAS_MiniCPMV_PROJ "clip.has_minicpmv_projector"
 #define KEY_USE_GELU "clip.use_gelu"
 #define KEY_N_EMBD "clip.%s.embedding_length"
 #define KEY_N_FF "clip.%s.feed_forward_length"
 #define KEY_N_BLOCK "clip.%s.block_count"
 #define KEY_N_HEAD "clip.%s.attention.head_count"
 #define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
 #define KEY_PROJ_DIM "clip.%s.projection_dim"
 #define KEY_TOKENS "tokenizer.ggml.tokens"
 #define KEY_N_POSITIONS "clip.text.context_length"
 #define KEY_IMAGE_SIZE "clip.vision.image_size"
 #define KEY_PATCH_SIZE "clip.vision.patch_size"
 #define KEY_IMAGE_MEAN "clip.vision.image_mean"
 #define KEY_IMAGE_STD "clip.vision.image_std"
 #define KEY_PROJ_TYPE "clip.projector_type"

 #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
 #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"

@@ -122,6 +124,14 @@ static std::string format(const char * fmt, ...) {
 #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
 #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE "model.image_newline"
+// MINICPMV
+// #define TN_MINICPMV_POS_EMBD "resampler.pos_embed"
+#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
+#define TN_MINICPMV_QUERY "resampler.query"
+#define TN_MINICPMV_PROJ "resampler.proj.weight"
+#define TN_MINICPMV_KV_PROJ "resampler.kv.weight"
+#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
+#define TN_MINICPMV_LN "resampler.ln_%s.%s"

 enum projector_type {
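The new clip.has_minicpmv_projector key is optional metadata in the mmproj gguf; later in this commit the loader reads it back with the same gguf lookup pattern already used for the other has_* flags. A minimal sketch of that pattern (illustrative, not part of the diff; ctx is assumed to be an already-opened gguf_context *):

// Sketch of the optional-key lookup used by the loader below. Older llava
// ggufs simply lack the key, so the flag stays false and the llava path runs.
bool has_minicpmv_projector = false;
int idx = gguf_find_key(ctx, KEY_HAS_MiniCPMV_PROJ);
if (idx != -1) {
    has_minicpmv_projector = gguf_get_val_bool(ctx, idx);
}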
@@ -129,6 +139,7 @@ enum projector_type {
     PROJECTOR_TYPE_MLP_NORM,
     PROJECTOR_TYPE_LDP,
     PROJECTOR_TYPE_LDPV2,
+    PROJECTOR_TYPE_RESAMPLER,
     PROJECTOR_TYPE_UNKNOWN,
 };

@@ -136,6 +147,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_MLP, "mlp" },
     { PROJECTOR_TYPE_LDP, "ldp" },
     { PROJECTOR_TYPE_LDPV2, "ldpv2"},
+    { PROJECTOR_TYPE_RESAMPLER, "resampler"},
 };

@@ -488,12 +500,34 @@ struct clip_vision_model {
     struct ggml_tensor * mm_model_mlp_2_b;
     struct ggml_tensor * mm_model_peg_0_w;
     struct ggml_tensor * mm_model_peg_0_b;
+
+    // MINICPMV projection
+    // struct ggml_tensor * mm_model_pos_embed;
+    struct ggml_tensor * mm_model_pos_embed_k;
+    struct ggml_tensor * mm_model_query;
+    struct ggml_tensor * mm_model_proj;
+    struct ggml_tensor * mm_model_kv_proj;
+    struct ggml_tensor * mm_model_attn_q_w;
+    struct ggml_tensor * mm_model_attn_q_b;
+    struct ggml_tensor * mm_model_attn_k_w;
+    struct ggml_tensor * mm_model_attn_k_b;
+    struct ggml_tensor * mm_model_attn_v_w;
+    struct ggml_tensor * mm_model_attn_v_b;
+    struct ggml_tensor * mm_model_attn_o_w;
+    struct ggml_tensor * mm_model_attn_o_b;
+    struct ggml_tensor * mm_model_ln_q_w;
+    struct ggml_tensor * mm_model_ln_q_b;
+    struct ggml_tensor * mm_model_ln_kv_w;
+    struct ggml_tensor * mm_model_ln_kv_b;
+    struct ggml_tensor * mm_model_ln_post_w;
+    struct ggml_tensor * mm_model_ln_post_b;
 };

 struct clip_ctx {
     bool has_text_encoder = false;
     bool has_vision_encoder = false;
     bool has_llava_projector = false;
+    bool has_minicpmv_projector = false;

     struct clip_vision_model vision_model;
     projector_type proj_type = PROJECTOR_TYPE_MLP;

@@ -520,7 +554,7 @@ struct clip_ctx {
     ggml_gallocr_t compute_alloc = NULL;
 };

-static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
+static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, std::pair<int, int> load_image_size = {448, 448}) {
     if (!ctx->has_vision_encoder) {
         LOG_TEE("This gguf file seems to have no vision encoder\n");
         return nullptr;

@@ -529,10 +563,15 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     const auto & model = ctx->vision_model;
     const auto & hparams = model.hparams;

     const int image_size = hparams.image_size;
+    int image_size_width  = image_size;
+    int image_size_height = image_size;
+    if (ctx->has_minicpmv_projector) {
+        image_size_width  = load_image_size.first;
+        image_size_height = load_image_size.second;
+    }
     const int patch_size = hparams.patch_size;
-    const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
-    const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
+    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
     const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
     const int hidden_size = hparams.hidden_size;
     const int n_head = hparams.n_head;

@@ -542,7 +581,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     const int batch_size = imgs->size;

-    if (ctx->has_llava_projector) {
+    if (ctx->has_llava_projector || ctx->has_minicpmv_projector) {
         GGML_ASSERT(batch_size == 1);
     }

@@ -555,7 +594,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     struct ggml_context * ctx0 = ggml_init(params);
     struct ggml_cgraph * gf = ggml_new_graph(ctx0);

-    struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size);
+    struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
     ggml_set_name(inp_raw, "inp_raw");
     ggml_set_input(inp_raw);

@@ -563,25 +602,27 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
     inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));

+    struct ggml_tensor * embeddings = inp;
+    struct ggml_tensor * pos_embed;
+
     if (ctx->has_patch_bias) {
         // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
         inp = ggml_add(ctx0, inp, model.patch_bias);
     }

-    // concat class_embeddings and patch_embeddings
-    struct ggml_tensor * embeddings = inp;
-    if (ctx->has_class_embedding) {
-        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
-        ggml_set_name(embeddings, "embeddings");
-        ggml_set_input(embeddings);
-        embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
-                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
-        embeddings = ggml_acc(ctx0, embeddings, inp,
-                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
+    if (ctx->has_llava_projector) {
+        // concat class_embeddings and patch_embeddings
+        if (ctx->has_class_embedding) {
+            embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+            ggml_set_name(embeddings, "embeddings");
+            ggml_set_input(embeddings);
+            embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+                    embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+            embeddings = ggml_acc(ctx0, embeddings, inp,
+                    embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
+        }
     }

     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
     ggml_set_name(positions, "positions");
     ggml_set_input(positions);

@@ -589,6 +630,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     embeddings =
         ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));

+    if (ctx->has_minicpmv_projector) {
+        int pos_w = image_size_width/patch_size;
+        int pos_h = image_size_height/patch_size;
+        pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
+        ggml_set_name(pos_embed, "pos_embed");
+        ggml_set_input(pos_embed);
+    }
+
     // pre-layernorm
     if (ctx->has_pre_norm) {
         embeddings = ggml_norm(ctx0, embeddings, eps);

@@ -687,6 +736,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }

     // llava projector
+    if (ctx->has_llava_projector)
     {
         embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);

@@ -864,6 +914,65 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
             embeddings = peg_0;
         }
+        else {
+            GGML_ASSERT(false);
+        }
+    }
+    // minicpmv projector
+    else if (ctx->has_minicpmv_projector)
+    {
+        if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+            struct ggml_tensor * q = model.mm_model_query;
+            { // layernorm
+                q = ggml_norm(ctx0, q, eps);
+                q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
+            }
+            struct ggml_tensor *k, *v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
+            { // layernorm
+                v = ggml_norm(ctx0, v, eps);
+                v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
+            }
+            { // position
+                // q = ggml_add(ctx0, q, model.mm_model_pos_embed);
+                k = ggml_add(ctx0, v, pos_embed);
+            }
+            { // attention
+                const int hidden_size = 4096;
+                const int d_head = 128;
+                const int n_head = hidden_size/d_head;
+                const int num_query = 96;
+
+                struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
+                Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
+                struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
+                struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
+                // permute
+                Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
+                Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
+                Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
+                K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
+                K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
+                K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
+                V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
+                V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
+                V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
+                struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+                KQ = ggml_soft_max_inplace(ctx0, KQ);
+                struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
+                KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
+                KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+                KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
+
+                embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
+            }
+            { // layernorm
+                embeddings = ggml_norm(ctx0, embeddings, eps);
+                embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
+            }
+            embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
+        }
         else {
             GGML_ASSERT(false);
         }
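In plain terms, the RESAMPLER branch above is a single cross-attention layer: a fixed set of 96 learned query vectors attends over the projected patch embeddings, so the projector always emits 96 tokens of width 4096 no matter how many patches the image produced. A rough shape walkthrough for one image (illustrative; it assumes a 448x448 input, patch_size 14 and no class token, so num_positions = 32*32 = 1024):

// q  = ln_q(mm_model_query)                : 96 queries x 4096
// v  = ln_kv(kv_proj(embeddings))          : 1024 patches x 4096
// k  = v + pos_embed (2D sincos, key side) : 1024 x 4096
// heads: 4096 = 32 heads x 128 dims; Q is pre-scaled by 1/sqrt(128)
// softmax(K^T Q)                           : 1024 x 96 attention map per head
// out = proj(ln_post(attn_out))            : 96 x 4096 image tokens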
@@ -878,7 +987,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 }

 // read and create ggml_context containing the tensors and their data
-struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1, std::pair<int, int> load_image_size) {
     struct ggml_context * meta = NULL;

     struct gguf_init_params params = {

@@ -1020,7 +1129,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx);
     }

-    GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
+    idx = gguf_find_key(ctx, KEY_HAS_MiniCPMV_PROJ);
+    if (idx != -1) {
+        new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx);
+    }
+
+    // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
+
     GGML_ASSERT(new_clip->has_vision_encoder);
     GGML_ASSERT(!new_clip->has_text_encoder);

@@ -1031,6 +1146,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
         LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
         LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
+        LOG_TEE("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
         LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
         LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
     }

@@ -1272,6 +1388,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
             vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
         }
+        else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+            // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
+            vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K);
+            vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY);
+            vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ);
+            vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ);
+            vision_model.mm_model_attn_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "weight"));
+            vision_model.mm_model_attn_k_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "weight"));
+            vision_model.mm_model_attn_v_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "weight"));
+            vision_model.mm_model_attn_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "bias"));
+            vision_model.mm_model_attn_k_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "bias"));
+            vision_model.mm_model_attn_v_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "bias"));
+            vision_model.mm_model_attn_o_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "weight"));
+            vision_model.mm_model_attn_o_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "bias"));
+            vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "weight"));
+            vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "bias"));
+            vision_model.mm_model_ln_kv_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "weight"));
+            vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias"));
+            vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
+            vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
+        }
         else {
             std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
             throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));

@@ -1310,7 +1447,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
         clip_image_f32_batch batch;
         batch.size = 1;
-        ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
+        ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, load_image_size);
         ggml_gallocr_reserve(new_clip->compute_alloc, gf);
         size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
         LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);

@@ -1424,6 +1561,19 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
     }
 }

+void uhd_normalize_image_u8_to_f32(struct clip_ctx * ctx, const clip_image_u8* src, clip_image_f32* dst) {
+    dst->nx = src->nx;
+    dst->ny = src->ny;
+    dst->buf.resize(src->buf.size());
+
+    const auto & m3 = ctx->image_mean;
+    const auto & s3 = ctx->image_std;
+    for (size_t i = 0; i < src->buf.size(); ++i) {
+        int c = i % 3; // rgb
+        dst->buf[i] = (static_cast<float>(src->buf[i]) / 255.0f - m3[c]) / s3[c];
+    }
+}
+
 inline float clip(float x, float lower, float upper) {
     return std::max(lower, std::min(x, upper));
 }
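uhd_normalize_image_u8_to_f32() is the usual per-channel (x/255 - mean)/std normalization, but unlike the existing preprocess path it keeps the incoming width and height instead of resizing to the model's square image_size. As a quick sanity check with illustrative constants (not the model's real statistics): mean = 0.5 and std = 0.5 map byte 255 to (255/255 - 0.5)/0.5 = 1.0 and byte 0 to (0 - 0.5)/0.5 = -1.0.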
@@ -1807,12 +1957,100 @@ int clip_n_patches(const struct clip_ctx * ctx) {
     if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
         n_patches /= 4;
+    } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+        n_patches = 96;
     }

     return n_patches;
 }

+static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>>& pos) {
+    assert(embed_dim % 2 == 0);
+    int H = pos.size();
+    int W = pos[0].size();
+
+    std::vector<float> omega(embed_dim / 2);
+    for (int i = 0; i < embed_dim / 2; ++i) {
+        omega[i] = 1.0 / pow(10000.0, static_cast<float>(i) / (embed_dim / 2));
+    }
+
+    std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
+    for (int h = 0; h < H; ++h) {
+        for (int w = 0; w < W; ++w) {
+            for (int d = 0; d < embed_dim / 2; ++d) {
+                float out_value = pos[h][w] * omega[d];
+                emb[h][w][d] = sin(out_value);
+                emb[h][w][d + embed_dim / 2] = cos(out_value);
+            }
+        }
+    }
+    return emb;
+}
+
+static std::vector<std::vector<std::vector<float>>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector<std::vector<std::vector<float>>>& grid) {
+    assert(embed_dim % 2 == 0);
+    std::vector<std::vector<std::vector<float>>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2)
+    std::vector<std::vector<std::vector<float>>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2)
+
+    int H = emb_h.size();
+    int W = emb_h[0].size();
+    std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
+
+    for (int h = 0; h < H; ++h) {
+        for (int w = 0; w < W; ++w) {
+            for (int d = 0; d < embed_dim / 2; ++d) {
+                emb[h][w][d] = emb_h[h][w][d];
+                emb[h][w][d + embed_dim / 2] = emb_w[h][w][d];
+            }
+        }
+    }
+    return emb;
+}
+
+static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, const std::pair<int, int> image_size) {
+    int grid_h_size = image_size.first;
+    int grid_w_size = image_size.second;
+
+    std::vector<float> grid_h(grid_h_size);
+    std::vector<float> grid_w(grid_w_size);
+
+    for (int i = 0; i < grid_h_size; ++i) {
+        grid_h[i] = static_cast<float>(i);
+    }
+    for (int i = 0; i < grid_w_size; ++i) {
+        grid_w[i] = static_cast<float>(i);
+    }
+
+    std::vector<std::vector<float>> grid(grid_h_size, std::vector<float>(grid_w_size));
+    for (int h = 0; h < grid_h_size; ++h) {
+        for (int w = 0; w < grid_w_size; ++w) {
+            grid[h][w] = grid_w[w];
+        }
+    }
+    std::vector<std::vector<std::vector<float>>> grid_2d = {grid, grid};
+    for (int h = 0; h < grid_h_size; ++h) {
+        for (int w = 0; w < grid_w_size; ++w) {
+            grid_2d[0][h][w] = grid_h[h];
+            grid_2d[1][h][w] = grid_w[w];
+        }
+    }
+
+    std::vector<std::vector<std::vector<float>>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d);
+
+    int H = image_size.first;
+    int W = image_size.second;
+    std::vector<std::vector<float>> pos_embed_2d(H * W, std::vector<float>(embed_dim));
+    for (int h = 0; h < H; ++h) {
+        for (int w = 0; w < W; ++w) {
+            pos_embed_2d[w * H + h] = pos_embed_3d[h][w];
+        }
+    }
+
+    return pos_embed_2d;
+}
+
-bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
+bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec, std::pair<int, int> load_image_size) {
     if (!ctx->has_vision_encoder) {
         LOG_TEE("This gguf file seems to have no vision encoder\n");
         return false;
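The three helpers added above build the standard 2D sine-cosine position embedding. With d = embed_dim/2 channels per axis, a scalar grid coordinate p is encoded as

  \omega_i = 10000^{-i/(d/2)}, \qquad \mathrm{emb}(p) = [\sin(p\,\omega_0), \dots, \sin(p\,\omega_{d/2-1}), \cos(p\,\omega_0), \dots, \cos(p\,\omega_{d/2-1})] \in \mathbb{R}^{d}

and the full embedding concatenates the two axes, \mathrm{PE}(h, w) = [\mathrm{emb}(h) \,\|\, \mathrm{emb}(w)] \in \mathbb{R}^{embed\_dim}; here embed_dim is 4096, matching the resampler width.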
@@ -1821,10 +2059,10 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
     clip_image_f32_batch imgs{};
     imgs.size = 1;
     imgs.data = img;
-    return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
+    return clip_image_batch_encode(ctx, n_threads, &imgs, vec, load_image_size);
 }

-bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
+bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec, std::pair<int, int> load_image_size) {
     if (!ctx->has_vision_encoder) {
         LOG_TEE("This gguf file seems to have no vision encoder\n");
         return false;

@@ -1834,6 +2072,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     if (ctx->has_llava_projector) {
         GGML_ASSERT(batch_size == 1); // TODO: support multiple images
     }
+    if (ctx->has_minicpmv_projector) {
+        GGML_ASSERT(batch_size == 1);
+    }

     // build the inference graph
     ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);

@@ -1844,8 +2085,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const auto & hparams = model.hparams;

     const int image_size = hparams.image_size;
+    int image_size_width  = image_size;
+    int image_size_height = image_size;
+    if (ctx->has_minicpmv_projector) {
+        image_size_width  = load_image_size.first;
+        image_size_height = load_image_size.second;
+    }
     const int patch_size = hparams.patch_size;
-    const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
+    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
     const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);

     {

@@ -1855,7 +2102,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         for (size_t i = 0; i < imgs->size; i++) {
             const int nx = imgs->data[i].nx;
             const int ny = imgs->data[i].ny;
-            GGML_ASSERT(nx == image_size && ny == image_size);
+            if (!ctx->has_minicpmv_projector) {
+                GGML_ASSERT(nx == image_size && ny == image_size);
+            }

             const int n = nx * ny;

@@ -1872,37 +2121,74 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
         free(data);
     }

-    {
-        if (ctx->has_class_embedding) {
-            struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
-
-            void* zero_mem = malloc(ggml_nbytes(embeddings));
-            memset(zero_mem, 0, ggml_nbytes(embeddings));
-            ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
-            free(zero_mem);
-        }
-    }
-
-    {
-        struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
-
-        int* positions_data = (int*)malloc(ggml_nbytes(positions));
-        for (int i = 0; i < num_positions; i++) {
-            positions_data[i] = i;
-        }
-        ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
-        free(positions_data);
-    }
-
-    {
-        struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
-        int* patches_data = (int*)malloc(ggml_nbytes(patches));
-        for (int i = 0; i < num_patches; i++) {
-            patches_data[i] = i + 1;
-        }
-        ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
-        free(patches_data);
-    }
+    if (ctx->has_minicpmv_projector) {
+        {
+            // inspired from siglip:
+            //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
+            //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
+            struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
+            int* positions_data = (int*)malloc(ggml_nbytes(positions));
+            for (int i = 0; i < num_positions; i++) {
+                positions_data[i] = std::floor(70.0*i/num_positions);
+            }
+            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
+            free(positions_data);
+        }
+
+        {
+            // inspired from resampler of Qwen-VL:
+            //    -> https://huggingface.co/Qwen/Qwen-VL/tree/main
+            //    -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
+            struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
+            int pos_w = image_size_width/patch_size;
+            int pos_h = image_size_height/patch_size;
+            int embed_dim = 4096;
+            auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
+
+            float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
+            for(int i=0;i<pos_w * pos_h;++i){
+                for(int j=0;j<embed_dim;++j){
+                    pos_embed_data[i*embed_dim+j]=pos_embed_t[i][j];
+                }
+            }
+
+            ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
+            free(pos_embed_data);
+        }
+    }
+    else{
+        {
+            if (ctx->has_class_embedding) {
+                struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
+
+                void* zero_mem = malloc(ggml_nbytes(embeddings));
+                memset(zero_mem, 0, ggml_nbytes(embeddings));
+                ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
+                free(zero_mem);
+            }
+        }
+
+        {
+            struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
+
+            int* positions_data = (int*)malloc(ggml_nbytes(positions));
+            for (int i = 0; i < num_positions; i++) {
+                positions_data[i] = i;
+            }
+            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
+            free(positions_data);
+        }
+
+        {
+            struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
+            int* patches_data = (int*)malloc(ggml_nbytes(patches));
+            for (int i = 0; i < num_patches; i++) {
+                patches_data[i] = i + 1;
+            }
+            ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
+            free(patches_data);
+        }
+    }

     if (ggml_backend_is_cpu(ctx->backend)) {

@@ -2072,6 +2358,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
     if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
         return ctx->vision_model.mm_3_b->ne[0];
     }
+    if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+        return 4096;
+    }

     std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
     throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));

View File

@@ -3,6 +3,7 @@

 #include <stddef.h>
 #include <stdint.h>
+#include <utility>

 #ifdef LLAMA_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)

@@ -36,7 +37,7 @@ struct clip_image_f32_batch {
     size_t size;
 };

-CLIP_API struct clip_ctx * clip_model_load    (const char * fname, int verbosity);
+CLIP_API struct clip_ctx * clip_model_load    (const char * fname, int verbosity, std::pair<int, int> load_image_size = {448, 448});
 CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);

 CLIP_API void clip_free(struct clip_ctx * ctx);

@@ -71,10 +72,12 @@ CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t byt
 /** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
 CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
+CLIP_API void uhd_normalize_image_u8_to_f32(struct clip_ctx * ctx, const clip_image_u8* src, clip_image_f32* dst);

 CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);

-CLIP_API bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
-CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
+CLIP_API bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec, std::pair<int, int> load_image_size = {448, 448});
+CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec, std::pair<int, int> load_image_size = {448, 448});

 CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
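A minimal sketch of how a caller is expected to use the widened API (illustrative, not part of the commit; the gguf filename is a placeholder and error handling is omitted). The {448, 448} defaults keep every existing llava call site source-compatible:

// load the mmproj with an explicit graph-sizing hint ...
struct clip_ctx * ctx = clip_model_load("mmproj-model-f16.gguf", 1, {448, 448});
// ... then encode a clip_image_f32 * img that was normalized at that same size
float * embd = (float *)malloc(clip_embd_nbytes(ctx));
bool ok = clip_image_encode(ctx, /*n_threads=*/4, img, embd, {448, 448});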

View File

@@ -409,3 +409,342 @@ void llava_image_embed_free(struct llava_image_embed * embed) {
free(embed->embed);
free(embed);
}
static bool encode_image_with_clip_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
// std::vector<clip_image_f32*> img_res_v;
// format VectN x H x W x RGB (N x 448 x 448 x 3)
clip_image_f32 * img_res_v = clip_image_f32_init();
std::pair<int, int> load_image_size;
load_image_size.first = img->nx;
load_image_size.second = img->ny;
uhd_normalize_image_u8_to_f32(ctx_clip, img, img_res_v);
const int64_t t_img_enc_start_us = ggml_time_us();
const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
LOG_TEE("\n%s: mm_patch_merge_type is %s.\n", __func__, mm_patch_merge_type);
*n_img_pos = clip_n_patches(ctx_clip);
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v, image_embd, load_image_size); // image_embd shape is 96 x 4096
if (!encoded) {
LOG_TEE("Unable to encode image\n");
return false;
}
LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
const int64_t t_img_enc_end_us = ggml_time_us();
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
return true;
}
static int ensure_divide(int length, int patch_size) {
return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
}
static std::pair<int, int> uhd_find_best_resize(std::pair<int, int> original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
int width = original_size.first;
int height = original_size.second;
if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
float r = static_cast<float>(width) / height;
height = static_cast<int>(scale_resolution / std::sqrt(r));
width = static_cast<int>(height * r);
}
int best_width = ensure_divide(width, patch_size);
int best_height = ensure_divide(height, patch_size);
return std::make_pair(best_width, best_height);
}
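A worked example of the resize rule above (illustrative numbers; patch_size = 14, scale_resolution = 448): a 1024x768 input has area 786432 > 448*448 = 200704, so it is shrunk at constant aspect ratio r = 4/3 to height = int(448/sqrt(4/3)) = 387 and width = int(387 * 4/3) = 516; ensure_divide() then snaps both to multiples of the patch size, giving a best size of 518 x 392.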
static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
int width, height;
std::tie(width, height) = original_size;
int grid_x, grid_y;
std::tie(grid_x, grid_y) = grid;
int refine_width = ensure_divide(width, grid_x);
int refine_height = ensure_divide(height, grid_y);
int grid_width = refine_width / grid_x;
int grid_height = refine_height / grid_y;
// auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line)
auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair
int best_grid_width, best_grid_height;
std::tie(best_grid_width, best_grid_height) = best_grid_size;
// std::pair<int, int> refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line)
std::pair<int, int> refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line)
return refine_size;
}
inline int clip(int x, int lower, int upper) {
return std::max(lower, std::min(x, upper));
}
static bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) {
const int nx = img.nx;
const int ny = img.ny;
dst.nx = target_width;
dst.ny = target_height;
dst.buf.resize(3 * target_width * target_height);
float Cc;
float C[5];
float d0, d2, d3, a0, a1, a2, a3;
int i, j, k, jj;
int x, y;
float dx, dy;
float tx, ty;
tx = (float)nx / (float)target_width;
ty = (float)ny / (float)target_height;
// Bicubic interpolation; adapted from ViT.cpp, inspired from :
// -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
// -> https://en.wikipedia.org/wiki/Bicubic_interpolation
for (i = 0; i < target_height; i++) {
for (j = 0; j < target_width; j++) {
x = (int)(tx * j);
y = (int)(ty * i);
dx = tx * j - x;
dy = ty * i - y;
for (k = 0; k < 3; k++) {
for (jj = 0; jj <= 3; jj++) {
d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
d0 = C[0] - C[1];
d2 = C[2] - C[1];
d3 = C[3] - C[1];
a0 = C[1];
a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
}
}
}
}
return true;
}
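The inner loop is a separable cubic: the same 1D interpolant is evaluated four times along x (filling C[0..3]) and once more along y. Written out, with f_0 the sample at the base pixel and d_0 = f_{-1} - f_0, d_2 = f_1 - f_0, d_3 = f_2 - f_0:

  f(t) = a_0 + a_1 t + a_2 t^2 + a_3 t^3, \quad 0 \le t < 1
  a_0 = f_0, \quad a_1 = -\tfrac{1}{3}d_0 + d_2 - \tfrac{1}{6}d_3, \quad a_2 = \tfrac{1}{2}(d_0 + d_2), \quad a_3 = -\tfrac{1}{6}d_0 - \tfrac{1}{2}d_2 + \tfrac{1}{6}d_3

so f(0) = f_0 and f(1) = f_1, i.e. the curve passes exactly through the two nearest samples.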
// inspired from LLaVA-UHD:
// -> https://arxiv.org/pdf/2403.11703
// -> https://github.com/thunlp/LLaVA-UHD
// -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
const std::pair<int, int> original_size={img->nx,img->ny};
const int original_width = img->nx;
const int original_height = img->ny;
const float log_ratio = log(1.0*original_width/original_height);
const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
const int multiple = fmin(ceil(ratio), max_slice_nums);
std::vector<std::vector<clip_image_u8 *>> images;
LOG_TEE("%s: multiple %d\n", __func__, multiple);
images.push_back(std::vector<clip_image_u8 *>());
if(multiple <= 1){
auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
clip_image_u8 *source_image = clip_image_u8_init();
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
// source_image = image.resize(best_size, Image.Resampling.BICUBIC)
images[images.size()-1].push_back(source_image);
}
else if(multiple > 1){
std::vector<int> candidate_split_grids_nums;
for (int i : {multiple - 1, multiple, multiple + 1}) {
if (i == 1 || i > max_slice_nums) {
continue;
}
candidate_split_grids_nums.push_back(i);
}
auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
clip_image_u8 *source_image = clip_image_u8_init();
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
// source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
images[images.size()-1].push_back(source_image);
std::vector<std::pair<int, int>> candidate_grids;
for (int split_grids_nums : candidate_split_grids_nums) {
int m = 1;
while (m <= split_grids_nums) {
if (split_grids_nums % m == 0) {
candidate_grids.emplace_back(m, split_grids_nums / m);
}
++m;
}
}
std::pair<int, int> best_grid{1, 1};
float min_error = std::numeric_limits<float>::infinity();
for (const auto& grid : candidate_grids) {
float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second));
if (error < min_error) {
best_grid = grid;
min_error = error;
}
}
LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
clip_image_u8 *refine_image = clip_image_u8_init();
bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
// split_to_patches
int width = refine_image->nx;
int height = refine_image->ny;
int grid_x = int(width / best_grid.first);
int grid_y = int(height / best_grid.second);
for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
images.push_back(std::vector<clip_image_u8 *>());
for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
clip_image_u8 * patch = clip_image_u8_init();
patch->nx = grid_x;
patch->ny = grid_y;
patch->buf.resize(3 * patch->nx * patch->ny);
for (int y = patches_i; y < patches_i + grid_y; ++y) {
for (int x = patches_j; x < patches_j + grid_x; ++x) {
const int i = 3 * (y * refine_image->nx + x);
const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j));
patch->buf[j] = refine_image->buf[i];
patch->buf[j+1] = refine_image->buf[i+1];
patch->buf[j+2] = refine_image->buf[i+2];
}
}
images[images.size()-1].push_back(patch);
}
}
}
return images;
}
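Concretely (illustrative numbers; scale_resolution = 448, max_slice_nums = 9): a 1344x1008 image gives ratio = 1344*1008/448^2 = 6.75, so multiple = min(ceil(6.75), 9) = 7 and the candidate grid counts are {6, 7, 8}. Every factorization m x (n/m) of each candidate count is scored by |log(w/h) - log(m/(n/m))|; for this aspect ratio (log 4/3 ~ 0.288) the winner is the 3x2 grid (log 3/2 ~ 0.405), so the result is one resized overview image plus 2 rows of 3 refined slices.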
struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img) {
std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img);
for (size_t i = 0; i < imgs.size(); ++i){
for (size_t j = 0; j < imgs[i].size(); ++j) {
LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
}
}
struct uhd_image_embed * results = new uhd_image_embed();
for (size_t i = 0; i < imgs.size(); ++i){
results->image_embeds.push_back(std::vector<llava_image_embed *>());
for (size_t j = 0; j < imgs[i].size(); ++j) {
float* image_embed = NULL;
int n_image_pos = 0;
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, imgs[i][j], &image_embed, &n_image_pos);
if (!image_embed_result) {
LOG_TEE("%s: coulnd't embed the image\n", __func__);
return NULL;
}
auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed));
result->embed = image_embed;
result->n_image_pos = n_image_pos;
results->image_embeds[i].push_back(result);
}
}
return results;
}
bool llava_image_embed_make_with_clip_img_ollama(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
auto embeds = llava_image_embed_make_with_bytes_uhd(ctx_clip, n_threads, img);
auto image_embed_slices = embeds->image_embeds;
if (!image_embed_slices[0][0]){
LOG_TEE("%s: failed to embeding image\n", __func__);
return false;
}
std::string fname = "./examples/minicpm-v2.5/slice_token_for_ollama.raw";
unsigned char* slice_token;
long image_bytes_length;
auto loaded = load_file_to_bytes(fname.c_str(), &slice_token, &image_bytes_length);
if (!loaded) {
LOG_TEE("%s: failed to load %s\n", __func__, fname.c_str());
return false;
}
float * all_image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*61);
int all_n_img_pos=0;
int token_len = clip_n_mmproj_embd(ctx_clip)*sizeof(float);
std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token, token_len);
std::memcpy(all_image_embd+token_len*all_n_img_pos, image_embed_slices[0][0]->embed, 96*token_len);
all_n_img_pos+=clip_n_patches(ctx_clip);
std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len, token_len);
if (image_embed_slices.size() > 1) {
std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len*2, token_len);
for (size_t i = 1; i < image_embed_slices.size(); ++i) {
for (size_t j = 0; j < image_embed_slices[i].size(); ++j) {
std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token, token_len);
std::memcpy(all_image_embd+token_len*all_n_img_pos, image_embed_slices[i][j]->embed, 96*token_len);
all_n_img_pos+=clip_n_patches(ctx_clip);
std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len, token_len);
if (j == image_embed_slices[i].size() - 1) {
std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len*4, token_len);
}
}
}
std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len*3, token_len);
}
*image_embd_out = all_image_embd;
*n_img_pos_out = all_n_img_pos;
return true;
}
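Reading the memcpy sequence above: slice_token appears to hold five special-token embeddings packed token_len bytes apart, and the output buffer interleaves them with the 96-token slice embeddings, roughly as an overview wrapped in begin/end tokens followed, when refined slices exist, by a slice section of per-cell embeddings with an extra separator token at the end of each grid row. The token roles here are an interpretation of the offsets, not identifiers from the commit.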
struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
unsigned char* image_bytes;
long image_bytes_length;
auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
if (!loaded) {
LOG_TEE("%s: failed to load %s\n", __func__, image_path);
return NULL;
}
clip_image_u8 * img = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
clip_image_u8_free(img);
LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
return NULL;
}
struct uhd_image_embed * embeds = llava_image_embed_make_with_bytes_uhd(ctx_clip, n_threads, img);
clip_image_u8_free(img);
free(image_bytes);
return embeds;
}
void llava_image_embed_free_uhd(struct uhd_image_embed * embed) {
for (size_t i = 0; i < embed->image_embeds.size(); ++i){
for (size_t j = 0; j < embed->image_embeds[i].size(); ++j){
free(embed->image_embeds[i][j]->embed);
free(embed->image_embeds[i][j]);
}
embed->image_embeds[i] = std::vector<struct llava_image_embed *>();
}
embed->image_embeds = std::vector<std::vector<struct llava_image_embed *>>();
}

View File

@@ -19,6 +19,10 @@

 struct clip_ctx;

+struct uhd_image_embed {
+    std::vector<std::vector<struct llava_image_embed *>> image_embeds;
+};
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -40,6 +44,13 @@ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct
 LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
 /** free an embedding made with llava_image_embed_make_* */

+/** build an image embed from image file bytes */
+LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img);
+/** build an image embed from a path to an image filename */
+LLAVA_API bool llava_image_embed_make_with_clip_img_ollama(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
+LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
+LLAVA_API void llava_image_embed_free_uhd(struct uhd_image_embed * embed);
+
 /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
 LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
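An illustrative end-to-end caller of the new UHD entry points (not part of the commit; assumes ctx_clip, ctx_llama and n_threads are already set up, and "image.jpg" is a placeholder path):

struct uhd_image_embed * embeds = llava_image_embed_make_with_filename_uhd(ctx_clip, n_threads, "image.jpg");
if (embeds) {
    int n_past = 0;
    // evaluate the overview first, then each refined slice, in grid order
    for (size_t i = 0; i < embeds->image_embeds.size(); ++i) {
        for (size_t j = 0; j < embeds->image_embeds[i].size(); ++j) {
            llava_eval_image_embed(ctx_llama, embeds->image_embeds[i][j], /*n_batch=*/512, &n_past);
        }
    }
    llava_image_embed_free_uhd(embeds); // releases the per-slice embeds ...
    delete embeds;                      // ... but not the struct, which make_with_bytes_uhd allocated with new
}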

View File

@@ -2,7 +2,7 @@
 #include "log.h"
 #include "common.h"
 #include "clip.h"
-#include "minicpmv.h"
+#include "llava.h"
 #include "minicpmv_wrapper.h"
 #include "llama.h"

View File

@@ -1,7 +1,7 @@
 #include "ggml.h"
 #include "common.h"
 #include "clip.h"
-#include "minicpmv.h"
+#include "llava.h"
 #include "minicpmv_wrapper.h"
 #include "llama.h"
 #include <cstdio>

View File

@@ -3,7 +3,7 @@

 #include "common.h"
 #include "clip.h"
-#include "minicpmv.h"
+#include "llava.h"
 #include "llama.h"

 #ifdef LLAMA_SHARED

View File

@ -1,3 +1,4 @@
-r ../../requirements/requirements-convert.txt -r ../../requirements/requirements-convert.txt
pillow~=10.2.0 pillow~=10.2.0
torch~=2.1.1 torch~=2.1.1
torchvision==0.16.2

View File

@@ -1,42 +0,0 @@
add_library(minicpmv OBJECT
minicpmv.cpp
minicpmv.h
clip.cpp
clip.h
)
target_link_libraries(minicpmv PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(minicpmv PUBLIC .)
target_include_directories(minicpmv PUBLIC ../..)
target_include_directories(minicpmv PUBLIC ../../common)
target_compile_features(minicpmv PRIVATE cxx_std_11)
add_library(minicpmv_static STATIC $<TARGET_OBJECTS:minicpmv>)
if (BUILD_SHARED_LIBS)
set_target_properties(minicpmv PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(minicpmv PRIVATE LLAMA_SHARED LLAMA_BUILD)
add_library(minicpmv_shared SHARED $<TARGET_OBJECTS:minicpmv>)
target_link_libraries(minicpmv_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS minicpmv_shared LIBRARY)
endif()
if (NOT MSVC)
target_compile_options(minicpmv PRIVATE -Wno-cast-qual) # stb_image.h
endif()
if(TARGET BUILD_INFO)
add_dependencies(minicpmv BUILD_INFO)
endif()
set(TARGET minicpmv-cli)
add_executable(minicpmv-cli minicpmv-cli.cpp)
install(TARGETS minicpmv-cli RUNTIME)
target_link_libraries(minicpmv-cli PRIVATE common minicpmv_wrapper minicpmv ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(minicpmv PRIVATE cxx_std_11)
add_library(minicpmv_wrapper OBJECT
minicpmv_wrapper.cpp
)
target_link_libraries(minicpmv_wrapper PRIVATE minicpmv ${CMAKE_THREAD_LIBS_INIT})

File diff suppressed because it is too large

View File

@@ -1,85 +0,0 @@
#ifndef CLIP_H
#define CLIP_H
#include <stddef.h>
#include <stdint.h>
#include <utility>
#ifdef LLAMA_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef LLAMA_BUILD
# define CLIP_API __declspec(dllexport)
# else
# define CLIP_API __declspec(dllimport)
# endif
# else
# define CLIP_API __attribute__ ((visibility ("default")))
# endif
#else
# define CLIP_API
#endif
#ifdef __cplusplus
extern "C" {
#endif
struct clip_ctx;
struct clip_image_u8_batch {
struct clip_image_u8 * data;
size_t size;
};
struct clip_image_f32_batch {
struct clip_image_f32 * data;
size_t size;
};
CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity, std::pair<int, int> load_image_size);
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
CLIP_API void clip_free(struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);
// TODO: should be enum, not string
CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
CLIP_API struct clip_image_f32 * clip_image_f32_init();
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
CLIP_API void normalize_image_u8_to_f32(struct clip_ctx * ctx, const clip_image_u8* src, clip_image_f32* dst);
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec, std::pair<int, int> load_image_size);
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec, std::pair<int, int> load_image_size);
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
#ifdef __cplusplus
}
#endif
#endif // CLIP_H
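Read end to end, the header implies a call sequence like the following minimal sketch (hypothetical file names and sizes; error handling elided):

// load the CLIP model, then load, normalize, and encode one image
std::pair<int, int> load_image_size = {448, 448};          // placeholder size
struct clip_ctx * ctx = clip_model_load("mmproj.gguf", /*verbosity=*/1, load_image_size);
struct clip_image_u8  * img  = clip_image_u8_init();
struct clip_image_f32 * imgf = clip_image_f32_init();
clip_image_load_from_file("image.png", img);
normalize_image_u8_to_f32(ctx, img, imgf);
float * embd = (float *) malloc(clip_embd_nbytes(ctx));    // room for one encoded image
clip_image_encode(ctx, /*n_threads=*/4, imgf, embd, load_image_size);
free(embd);
clip_image_f32_free(imgf);
clip_image_u8_free(img);
clip_free(ctx);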

View File

@@ -1,452 +0,0 @@
#include "clip.h"
#include "common.h"
#include "llama.h"
#include "minicpmv.h"
#include "base64.hpp"
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <numeric>
// RGB uint8 image
struct clip_image_u8 {
int nx;
int ny;
std::vector<uint8_t> buf;
};
// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
struct clip_image_f32 {
int nx;
int ny;
std::vector<float> buf;
};
struct clip_image_grid_shape {
int first;
int second;
};
static bool encode_image_with_clip_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
// std::vector<clip_image_f32*> img_res_v;
// format VectN x H x W x RGB (N x 448 x 448 x 3)
clip_image_f32 * img_res_v = clip_image_f32_init();
std::pair<int, int> load_image_size;
load_image_size.first = img->nx;
load_image_size.second = img->ny;
normalize_image_u8_to_f32(ctx_clip, img, img_res_v);
const int64_t t_img_enc_start_us = ggml_time_us();
const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
LOG_TEE("\n%s: mm_patch_merge_type is %s.\n", __func__, mm_patch_merge_type);
*n_img_pos = clip_n_patches(ctx_clip);
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v, image_embd, load_image_size); // image_embd shape is 96 x 4096
if (!encoded) {
LOG_TEE("Unable to encode image\n");
return false;
}
LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
const int64_t t_img_enc_end_us = ggml_time_us();
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
return true;
}
bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
// make sure that the correct mmproj was used, i.e., compare apples to apples
int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
if (n_image_embd != n_llama_embd) {
LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
return false;
}
return true;
}
bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6);
if (!image_embd) {
LOG_TEE("Unable to allocate memory for image embeddings\n");
return false;
}
int n_img_pos;
if (!encode_image_with_clip_uhd(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
LOG_TEE("%s: cannot encode image, aborting\n", __func__);
free(image_embd);
return false;
}
*image_embd_out = image_embd;
*n_img_pos_out = n_img_pos;
return true;
}
bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
int n_embd = llama_n_embd(llama_get_model(ctx_llama));
for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
int n_eval = image_embed->n_image_pos - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
if (llama_decode(ctx_llama, batch)) {
LOG_TEE("%s : failed to eval\n", __func__);
return false;
}
*n_past += n_eval;
}
return true;
}
static int ensure_divide(int length, int patch_size) {
return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
}
static std::pair<int, int> uhd_find_best_resize(std::pair<int, int> original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
int width = original_size.first;
int height = original_size.second;
if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
float r = static_cast<float>(width) / height;
height = static_cast<int>(scale_resolution / std::sqrt(r));
width = static_cast<int>(height * r);
}
int best_width = ensure_divide(width, patch_size);
int best_height = ensure_divide(height, patch_size);
return std::make_pair(best_width, best_height);
}
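To make the resize arithmetic concrete, a worked example with illustrative numbers (mine, not from the source):

// original_size = {1200, 800}, scale_resolution = 448, patch_size = 14
// r = 1200 / 800 = 1.5; height = int(448 / sqrt(1.5)) = 365; width = int(365 * 1.5) = 547
// ensure_divide(547, 14) = 546 and ensure_divide(365, 14) = 364, so the best size is {546, 364}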
static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
int width, height;
std::tie(width, height) = original_size;
int grid_x, grid_y;
std::tie(grid_x, grid_y) = grid;
int refine_width = ensure_divide(width, grid_x);
int refine_height = ensure_divide(height, grid_y);
int grid_width = refine_width / grid_x;
int grid_height = refine_height / grid_y;
auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale);
int best_grid_width, best_grid_height;
std::tie(best_grid_width, best_grid_height) = best_grid_size;
std::pair<int, int> refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y);
return refine_size;
}
inline int clip(int x, int lower, int upper) {
return std::max(lower, std::min(x, upper));
}
static bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) {
const int nx = img.nx;
const int ny = img.ny;
dst.nx = target_width;
dst.ny = target_height;
dst.buf.resize(3 * target_width * target_height);
float Cc;
float C[5];
float d0, d2, d3, a0, a1, a2, a3;
int i, j, k, jj;
int x, y;
float dx, dy;
float tx, ty;
tx = (float)nx / (float)target_width;
ty = (float)ny / (float)target_height;
// Bicubic interpolation; adapted from ViT.cpp, inspired from :
// -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
// -> https://en.wikipedia.org/wiki/Bicubic_interpolation
for (i = 0; i < target_height; i++) {
for (j = 0; j < target_width; j++) {
x = (int)(tx * j);
y = (int)(ty * i);
dx = tx * j - x;
dy = ty * i - y;
for (k = 0; k < 3; k++) {
for (jj = 0; jj <= 3; jj++) {
d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
d0 = C[0] - C[1];
d2 = C[2] - C[1];
d3 = C[3] - C[1];
a0 = C[1];
a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
const uint8_t Cc2 = (uint8_t) std::min(std::max(std::round(Cc), 0.0f), 255.0f);
dst.buf[(i * target_width + j) * 3 + k] = Cc2;
}
}
}
}
return true;
}
// inspired from LLaVA-UHD:
// -> https://arxiv.org/pdf/2403.11703
// -> https://github.com/thunlp/LLaVA-UHD
// -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
const std::pair<int, int> original_size={img->nx,img->ny};
const int original_width = img->nx;
const int original_height = img->ny;
const float log_ratio = log(1.0 * original_width / original_height); // aspect ratio in log space
const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
const int multiple = fmin(ceil(ratio), max_slice_nums);
std::vector<std::vector<clip_image_u8 *>> images;
LOG_TEE("%s: multiple %d\n", __func__, multiple);
images.push_back(std::vector<clip_image_u8 *>());
if(multiple <= 1){
auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
clip_image_u8 *source_image = clip_image_u8_init();
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
// source_image = image.resize(best_size, Image.Resampling.BICUBIC)
images[images.size()-1].push_back(source_image);
}
else if(multiple > 1){
std::vector<int> candidate_split_grids_nums;
for (int i : {multiple - 1, multiple, multiple + 1}) {
if (i == 1 || i > max_slice_nums) {
continue;
}
candidate_split_grids_nums.push_back(i);
}
auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
clip_image_u8 *source_image = clip_image_u8_init();
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
// source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
images[images.size()-1].push_back(source_image);
std::vector<std::pair<int, int>> candidate_grids;
for (int split_grids_nums : candidate_split_grids_nums) {
int m = 1;
while (m <= split_grids_nums) {
if (split_grids_nums % m == 0) {
candidate_grids.emplace_back(m, split_grids_nums / m);
}
++m;
}
}
std::pair<int, int> best_grid{1, 1};
float min_error = std::numeric_limits<float>::infinity();
for (const auto& grid : candidate_grids) {
float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second));
if (error < min_error) {
best_grid = grid;
min_error = error;
}
}
LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
clip_image_u8 *refine_image = clip_image_u8_init();
bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
// split_to_patches
int width = refine_image->nx;
int height = refine_image->ny;
int grid_x = int(width / best_grid.first);
int grid_y = int(height / best_grid.second);
for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
images.push_back(std::vector<clip_image_u8 *>());
for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
clip_image_u8 * patch = clip_image_u8_init();
patch->nx = grid_x;
patch->ny = grid_y;
patch->buf.resize(3 * patch->nx * patch->ny);
for (int y = patches_i; y < patches_i + grid_y; ++y) {
for (int x = patches_j; x < patches_j + grid_x; ++x) {
const int i = 3 * (y * refine_image->nx + x);
const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j));
patch->buf[j] = refine_image->buf[i];
patch->buf[j+1] = refine_image->buf[i+1];
patch->buf[j+2] = refine_image->buf[i+2];
}
}
images[images.size()-1].push_back(patch);
}
}
}
return images;
}
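Continuing the same illustrative 1200x800 example through the slicing logic (again my numbers, not from the source):

// ratio = 1200 * 800 / 448^2 ≈ 4.78, so multiple = min(ceil(4.78), 9) = 5
// candidate grid counts are {4, 5, 6}; grid {3, 2} matches log_ratio = log(1.5) exactly, so best_grid = {3, 2}
// refine_size = {1344, 896}: one downscaled overview image plus a 3x2 grid of 448x448 slices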
struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img) {
std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img);
for (size_t i = 0; i < imgs.size(); ++i){
for (size_t j = 0; j < imgs[i].size(); ++j) {
LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
}
}
struct uhd_image_embed * results = new uhd_image_embed();
for (size_t i = 0; i < imgs.size(); ++i){
results->image_embeds.push_back(std::vector<llava_image_embed *>());
for (size_t j = 0; j < imgs[i].size(); ++j) {
float* image_embed = NULL;
int n_image_pos = 0;
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, imgs[i][j], &image_embed, &n_image_pos);
if (!image_embed_result) {
LOG_TEE("%s: coulnd't embed the image\n", __func__);
return NULL;
}
auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed));
result->embed = image_embed;
result->n_image_pos = n_image_pos;
results->image_embeds[i].push_back(result);
}
}
return results;
}
static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
auto file = fopen(path, "rb");
if (file == NULL) {
LOG_TEE("%s: can't read file %s\n", __func__, path);
return false;
}
fseek(file, 0, SEEK_END);
auto fileSize = ftell(file);
fseek(file, 0, SEEK_SET);
auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
if (buffer == NULL) {
LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
perror("Memory allocation error");
fclose(file);
return false;
}
errno = 0;
size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer
if (ferror(file)) {
die_fmt("read error: %s", strerror(errno));
}
if (ret != (size_t) fileSize) {
die("unexpectedly reached end of file");
}
fclose(file); // Close the file
*bytesOut = buffer;
*sizeOut = fileSize;
return true;
}
bool llava_image_embed_make_with_clip_img_ollama(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
auto embeds = llava_image_embed_make_with_bytes_uhd(ctx_clip, n_threads, img);
if (!embeds) {
LOG_TEE("%s: failed to embed image\n", __func__);
return false;
}
auto image_embed_slices = embeds->image_embeds;
if (!image_embed_slices[0][0]) {
LOG_TEE("%s: failed to embed image\n", __func__);
return false;
}
std::string fname = "./examples/minicpm-v2.5/slice_token_for_ollama.raw";
unsigned char* slice_token;
long image_bytes_length;
auto loaded = load_file_to_bytes(fname.c_str(), &slice_token, &image_bytes_length);
if (!loaded) {
LOG_TEE("%s: failed to load %s\n", __func__, fname.c_str());
return false;
}
float * all_image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*61);
int all_n_img_pos=0;
int token_len = clip_n_mmproj_embd(ctx_clip)*sizeof(float);
std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token, token_len);
std::memcpy(all_image_embd+token_len*all_n_img_pos, image_embed_slices[0][0]->embed, 96*token_len);
all_n_img_pos+=clip_n_patches(ctx_clip);
std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len, token_len);
if (image_embed_slices.size() > 1) {
std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len*2, token_len);
for (size_t i = 1; i < image_embed_slices.size(); ++i) {
for (size_t j = 0; j < image_embed_slices[i].size(); ++j) {
std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token, token_len);
std::memcpy(all_image_embd+token_len*all_n_img_pos, image_embed_slices[i][j]->embed, 96*token_len);
all_n_img_pos+=clip_n_patches(ctx_clip);
std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len, token_len);
if (j == image_embed_slices[i].size() - 1) {
std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len*4, token_len);
}
}
}
std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len*3, token_len);
}
*image_embd_out = all_image_embd;
*n_img_pos_out = all_n_img_pos;
return true;
}
struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
unsigned char* image_bytes;
long image_bytes_length;
auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
if (!loaded) {
LOG_TEE("%s: failed to load %s\n", __func__, image_path);
return NULL;
}
clip_image_u8 * img = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
clip_image_u8_free(img);
free(image_bytes);
LOG_TEE("%s: can't load image from bytes, is it a valid image?\n", __func__);
return NULL;
}
struct uhd_image_embed * embeds = llava_image_embed_make_with_bytes_uhd(ctx_clip, n_threads, img);
clip_image_u8_free(img);
free(image_bytes);
return embeds;
}
void llava_image_embed_free_uhd(struct uhd_image_embed * embed) {
for (size_t i = 0; i < embed->image_embeds.size(); ++i){
for (size_t j = 0; j < embed->image_embeds[i].size(); ++j){
free(embed->image_embeds[i][j]->embed);
free(embed->image_embeds[i][j]);
}
embed->image_embeds[i] = std::vector<struct llava_image_embed *>();
}
embed->image_embeds = std::vector<std::vector<struct llava_image_embed *>>();
}

View File

@@ -1,54 +0,0 @@
#ifndef LLAVA_H
#define LLAVA_H
#include "ggml.h"
#ifdef LLAMA_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef LLAMA_BUILD
# define MINICPMV_API __declspec(dllexport)
# else
# define MINICPMV_API __declspec(dllimport)
# endif
# else
# define MINICPMV_API __attribute__ ((visibility ("default")))
# endif
#else
# define MINICPMV_API
#endif
struct clip_ctx;
struct uhd_image_embed {
std::vector<std::vector<struct llava_image_embed *>> image_embeds;
};
#ifdef __cplusplus
extern "C" {
#endif
struct llava_image_embed {
float * embed;
int n_image_pos;
};
/** sanity check for clip <-> llava embed size match */
MINICPMV_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip);
MINICPMV_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
/** build an image embed from image file bytes */
MINICPMV_API struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img);
MINICPMV_API bool llava_image_embed_make_with_clip_img_ollama(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
/** build an image embed from a path to an image filename */
MINICPMV_API struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
/** free an embedding made with llava_image_embed_make_* */
MINICPMV_API void llava_image_embed_free_uhd(struct uhd_image_embed * embed);
/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
MINICPMV_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
#ifdef __cplusplus
}
#endif
#endif
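A minimal lifecycle sketch for this API (illustrative; assumes ctx_clip and ctx_llama are already loaded, and "image.png" is a placeholder path):

struct uhd_image_embed * embeds = llava_image_embed_make_with_filename_uhd(ctx_clip, /*n_threads=*/4, "image.png");
if (embeds) {
    if (llava_validate_embed_size(ctx_llama, ctx_clip)) {
        int n_past = 0;
        for (auto & row : embeds->image_embeds) {      // one row per slice row, plus the overview
            for (auto * slice : row) {
                llava_eval_image_embed(ctx_llama, slice, /*n_batch=*/512, &n_past);
            }
        }
    }
    llava_image_embed_free_uhd(embeds);
}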

View File

@@ -1,4 +0,0 @@
-r ../../requirements/requirements-convert.txt
pillow~=10.2.0
torch~=2.1.1
torchvision==0.16.2