mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 11:40:17 +00:00
Merge pull request #11 from OpenBMB/pr_add_all_in_llava
Pr add all in llava
This commit is contained in:
commit
ee5b850958
10
Makefile
10
Makefile
@ -910,12 +910,12 @@ llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/cli
|
|||||||
$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
|
$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
minicpmv-cli: examples/minicpmv/minicpmv-cli.cpp examples/minicpmv/clip.h examples/minicpmv/clip.cpp examples/minicpmv/minicpmv.h examples/minicpmv/minicpmv.cpp examples/minicpmv/minicpmv_wrapper.h examples/minicpmv/minicpmv_wrapper.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
minicpmv-cli: examples/llava/minicpmv-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp examples/llava/minicpmv_wrapper.h examples/llava/minicpmv_wrapper.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) -c examples/minicpmv/clip.cpp -o $(call GET_OBJ_FILE, examples/minicpmv/clip.cpp) -Wno-cast-qual
|
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
|
||||||
$(CXX) $(CXXFLAGS) -c examples/minicpmv/minicpmv.cpp -o $(call GET_OBJ_FILE, examples/minicpmv/minicpmv.cpp)
|
$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
|
||||||
$(CXX) $(CXXFLAGS) -c examples/minicpmv/minicpmv_wrapper.cpp -o $(call GET_OBJ_FILE, examples/minicpmv/minicpmv_wrapper.cpp)
|
$(CXX) $(CXXFLAGS) -c examples/llava/minicpmv_wrapper.cpp -o $(call GET_OBJ_FILE, examples/llava/minicpmv_wrapper.cpp)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/minicpmv/clip.cpp examples/minicpmv/minicpmv.cpp examples/minicpmv/minicpmv_wrapper.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/minicpmv/clip.cpp) $(call GET_OBJ_FILE, examples/minicpmv/minicpmv.cpp) $(call GET_OBJ_FILE, examples/minicpmv/minicpmv_wrapper.cpp) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp examples/llava/minicpmv_wrapper.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) $(call GET_OBJ_FILE, examples/llava/minicpmv_wrapper.cpp) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
|
@ -673,6 +673,44 @@ class GPTNeoXModel(Model):
|
|||||||
self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
|
self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
|
||||||
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
|
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
|
||||||
|
|
||||||
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
|
del bid # unused
|
||||||
|
|
||||||
|
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
|
||||||
|
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
|
||||||
|
|
||||||
|
tensors: list[tuple[str, Tensor]] = []
|
||||||
|
|
||||||
|
if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name):
|
||||||
|
# Map bloom-style qkv_linear to gpt-style qkv_linear
|
||||||
|
# bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
|
||||||
|
# gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
|
||||||
|
qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed))
|
||||||
|
data_torch = torch.cat(
|
||||||
|
(
|
||||||
|
qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
|
||||||
|
qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
|
||||||
|
qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
|
||||||
|
),
|
||||||
|
dim=0,
|
||||||
|
)
|
||||||
|
logger.info("re-format attention.linear_qkv.weight")
|
||||||
|
elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name):
|
||||||
|
qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head))
|
||||||
|
data_torch = torch.cat(
|
||||||
|
(
|
||||||
|
qkv_bias[:, 0, :].reshape((n_embed,)),
|
||||||
|
qkv_bias[:, 1, :].reshape((n_embed,)),
|
||||||
|
qkv_bias[:, 2, :].reshape((n_embed,)),
|
||||||
|
),
|
||||||
|
dim=0,
|
||||||
|
)
|
||||||
|
logger.info("re-format attention.linear_qkv.bias")
|
||||||
|
|
||||||
|
tensors.append((self.map_tensor_name(name), data_torch))
|
||||||
|
|
||||||
|
return tensors
|
||||||
|
|
||||||
|
|
||||||
@Model.register("BloomForCausalLM")
|
@Model.register("BloomForCausalLM")
|
||||||
class BloomModel(Model):
|
class BloomModel(Model):
|
||||||
|
@ -35,3 +35,8 @@ add_executable(llava-cli llava-cli.cpp)
|
|||||||
install(TARGETS llava-cli RUNTIME)
|
install(TARGETS llava-cli RUNTIME)
|
||||||
target_link_libraries(llava-cli PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(llava-cli PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||||
target_compile_features(llava PRIVATE cxx_std_11)
|
target_compile_features(llava PRIVATE cxx_std_11)
|
||||||
|
|
||||||
|
add_library(minicpmv_wrapper OBJECT
|
||||||
|
minicpmv_wrapper.cpp
|
||||||
|
)
|
||||||
|
target_link_libraries(minicpmv_wrapper PRIVATE llava ${CMAKE_THREAD_LIBS_INIT})
|
Before Width: | Height: | Size: 304 KiB After Width: | Height: | Size: 304 KiB |
@ -3,6 +3,7 @@
|
|||||||
// I'll gradually clean and extend it
|
// I'll gradually clean and extend it
|
||||||
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
||||||
#include "clip.h"
|
#include "clip.h"
|
||||||
|
#include "common.h"
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "ggml-alloc.h"
|
#include "ggml-alloc.h"
|
||||||
@ -76,6 +77,7 @@ static std::string format(const char * fmt, ...) {
|
|||||||
#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
|
#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
|
||||||
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
|
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
|
||||||
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
|
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
|
||||||
|
#define KEY_HAS_MiniCPMV_PROJ "clip.has_minicpmv_projector"
|
||||||
#define KEY_USE_GELU "clip.use_gelu"
|
#define KEY_USE_GELU "clip.use_gelu"
|
||||||
#define KEY_N_EMBD "clip.%s.embedding_length"
|
#define KEY_N_EMBD "clip.%s.embedding_length"
|
||||||
#define KEY_N_FF "clip.%s.feed_forward_length"
|
#define KEY_N_FF "clip.%s.feed_forward_length"
|
||||||
@ -122,6 +124,14 @@ static std::string format(const char * fmt, ...) {
|
|||||||
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
||||||
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
|
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
|
||||||
#define TN_IMAGE_NEWLINE "model.image_newline"
|
#define TN_IMAGE_NEWLINE "model.image_newline"
|
||||||
|
// MINICPMV
|
||||||
|
// #define TN_MINICPMV_POS_EMBD "resampler.pos_embed"
|
||||||
|
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
|
||||||
|
#define TN_MINICPMV_QUERY "resampler.query"
|
||||||
|
#define TN_MINICPMV_PROJ "resampler.proj.weight"
|
||||||
|
#define TN_MINICPMV_KV_PROJ "resampler.kv.weight"
|
||||||
|
#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
|
||||||
|
#define TN_MINICPMV_LN "resampler.ln_%s.%s"
|
||||||
|
|
||||||
|
|
||||||
enum projector_type {
|
enum projector_type {
|
||||||
@ -129,6 +139,7 @@ enum projector_type {
|
|||||||
PROJECTOR_TYPE_MLP_NORM,
|
PROJECTOR_TYPE_MLP_NORM,
|
||||||
PROJECTOR_TYPE_LDP,
|
PROJECTOR_TYPE_LDP,
|
||||||
PROJECTOR_TYPE_LDPV2,
|
PROJECTOR_TYPE_LDPV2,
|
||||||
|
PROJECTOR_TYPE_RESAMPLER,
|
||||||
PROJECTOR_TYPE_UNKNOWN,
|
PROJECTOR_TYPE_UNKNOWN,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -136,6 +147,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
|||||||
{ PROJECTOR_TYPE_MLP, "mlp" },
|
{ PROJECTOR_TYPE_MLP, "mlp" },
|
||||||
{ PROJECTOR_TYPE_LDP, "ldp" },
|
{ PROJECTOR_TYPE_LDP, "ldp" },
|
||||||
{ PROJECTOR_TYPE_LDPV2, "ldpv2"},
|
{ PROJECTOR_TYPE_LDPV2, "ldpv2"},
|
||||||
|
{ PROJECTOR_TYPE_RESAMPLER, "resampler"},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -488,12 +500,34 @@ struct clip_vision_model {
|
|||||||
struct ggml_tensor * mm_model_mlp_2_b;
|
struct ggml_tensor * mm_model_mlp_2_b;
|
||||||
struct ggml_tensor * mm_model_peg_0_w;
|
struct ggml_tensor * mm_model_peg_0_w;
|
||||||
struct ggml_tensor * mm_model_peg_0_b;
|
struct ggml_tensor * mm_model_peg_0_b;
|
||||||
|
|
||||||
|
// MINICPMV projection
|
||||||
|
// struct ggml_tensor * mm_model_pos_embed;
|
||||||
|
struct ggml_tensor * mm_model_pos_embed_k;
|
||||||
|
struct ggml_tensor * mm_model_query;
|
||||||
|
struct ggml_tensor * mm_model_proj;
|
||||||
|
struct ggml_tensor * mm_model_kv_proj;
|
||||||
|
struct ggml_tensor * mm_model_attn_q_w;
|
||||||
|
struct ggml_tensor * mm_model_attn_q_b;
|
||||||
|
struct ggml_tensor * mm_model_attn_k_w;
|
||||||
|
struct ggml_tensor * mm_model_attn_k_b;
|
||||||
|
struct ggml_tensor * mm_model_attn_v_w;
|
||||||
|
struct ggml_tensor * mm_model_attn_v_b;
|
||||||
|
struct ggml_tensor * mm_model_attn_o_w;
|
||||||
|
struct ggml_tensor * mm_model_attn_o_b;
|
||||||
|
struct ggml_tensor * mm_model_ln_q_w;
|
||||||
|
struct ggml_tensor * mm_model_ln_q_b;
|
||||||
|
struct ggml_tensor * mm_model_ln_kv_w;
|
||||||
|
struct ggml_tensor * mm_model_ln_kv_b;
|
||||||
|
struct ggml_tensor * mm_model_ln_post_w;
|
||||||
|
struct ggml_tensor * mm_model_ln_post_b;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct clip_ctx {
|
struct clip_ctx {
|
||||||
bool has_text_encoder = false;
|
bool has_text_encoder = false;
|
||||||
bool has_vision_encoder = false;
|
bool has_vision_encoder = false;
|
||||||
bool has_llava_projector = false;
|
bool has_llava_projector = false;
|
||||||
|
bool has_minicpmv_projector = false;
|
||||||
|
|
||||||
struct clip_vision_model vision_model;
|
struct clip_vision_model vision_model;
|
||||||
projector_type proj_type = PROJECTOR_TYPE_MLP;
|
projector_type proj_type = PROJECTOR_TYPE_MLP;
|
||||||
@ -520,7 +554,7 @@ struct clip_ctx {
|
|||||||
ggml_gallocr_t compute_alloc = NULL;
|
ggml_gallocr_t compute_alloc = NULL;
|
||||||
};
|
};
|
||||||
|
|
||||||
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
|
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, std::pair<int, int> load_image_size = {448, 448}) {
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||||
return nullptr;
|
return nullptr;
|
||||||
@ -530,9 +564,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||||||
const auto & hparams = model.hparams;
|
const auto & hparams = model.hparams;
|
||||||
|
|
||||||
const int image_size = hparams.image_size;
|
const int image_size = hparams.image_size;
|
||||||
|
int image_size_width = image_size;
|
||||||
|
int image_size_height = image_size;
|
||||||
|
if (ctx->has_minicpmv_projector) {
|
||||||
|
image_size_width = load_image_size.first;
|
||||||
|
image_size_height = load_image_size.second;
|
||||||
|
}
|
||||||
const int patch_size = hparams.patch_size;
|
const int patch_size = hparams.patch_size;
|
||||||
const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
|
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
||||||
const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
|
|
||||||
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
|
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
|
||||||
const int hidden_size = hparams.hidden_size;
|
const int hidden_size = hparams.hidden_size;
|
||||||
const int n_head = hparams.n_head;
|
const int n_head = hparams.n_head;
|
||||||
@ -542,7 +581,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||||||
|
|
||||||
const int batch_size = imgs->size;
|
const int batch_size = imgs->size;
|
||||||
|
|
||||||
if (ctx->has_llava_projector) {
|
if (ctx->has_llava_projector || ctx->has_minicpmv_projector) {
|
||||||
GGML_ASSERT(batch_size == 1);
|
GGML_ASSERT(batch_size == 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -555,7 +594,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||||||
struct ggml_context * ctx0 = ggml_init(params);
|
struct ggml_context * ctx0 = ggml_init(params);
|
||||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||||
|
|
||||||
struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size);
|
struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
|
||||||
ggml_set_name(inp_raw, "inp_raw");
|
ggml_set_name(inp_raw, "inp_raw");
|
||||||
ggml_set_input(inp_raw);
|
ggml_set_input(inp_raw);
|
||||||
|
|
||||||
@ -563,14 +602,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||||||
|
|
||||||
inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
|
inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
|
||||||
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
|
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
|
||||||
|
struct ggml_tensor * embeddings = inp;
|
||||||
|
struct ggml_tensor * pos_embed;
|
||||||
|
|
||||||
if (ctx->has_patch_bias) {
|
if (ctx->has_patch_bias) {
|
||||||
// inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
|
// inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
|
||||||
inp = ggml_add(ctx0, inp, model.patch_bias);
|
inp = ggml_add(ctx0, inp, model.patch_bias);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if(ctx->has_llava_projector){
|
||||||
// concat class_embeddings and patch_embeddings
|
// concat class_embeddings and patch_embeddings
|
||||||
struct ggml_tensor * embeddings = inp;
|
|
||||||
if (ctx->has_class_embedding) {
|
if (ctx->has_class_embedding) {
|
||||||
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
|
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
|
||||||
ggml_set_name(embeddings, "embeddings");
|
ggml_set_name(embeddings, "embeddings");
|
||||||
@ -580,7 +621,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||||||
embeddings = ggml_acc(ctx0, embeddings, inp,
|
embeddings = ggml_acc(ctx0, embeddings, inp,
|
||||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
|
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
|
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
|
||||||
ggml_set_name(positions, "positions");
|
ggml_set_name(positions, "positions");
|
||||||
@ -589,6 +630,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||||||
embeddings =
|
embeddings =
|
||||||
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
|
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
|
||||||
|
|
||||||
|
if(ctx->has_minicpmv_projector){
|
||||||
|
int pos_w = image_size_width/patch_size;
|
||||||
|
int pos_h = image_size_height/patch_size;
|
||||||
|
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
|
||||||
|
ggml_set_name(pos_embed, "pos_embed");
|
||||||
|
ggml_set_input(pos_embed);
|
||||||
|
}
|
||||||
|
|
||||||
// pre-layernorm
|
// pre-layernorm
|
||||||
if (ctx->has_pre_norm) {
|
if (ctx->has_pre_norm) {
|
||||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||||
@ -687,6 +736,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||||||
}
|
}
|
||||||
|
|
||||||
// llava projector
|
// llava projector
|
||||||
|
if(ctx->has_llava_projector)
|
||||||
{
|
{
|
||||||
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
|
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
|
||||||
|
|
||||||
@ -864,6 +914,65 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||||||
peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
|
peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
|
||||||
embeddings = peg_0;
|
embeddings = peg_0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
else {
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// minicpmv projector
|
||||||
|
else if(ctx->has_minicpmv_projector)
|
||||||
|
{
|
||||||
|
if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
|
||||||
|
struct ggml_tensor * q = model.mm_model_query;
|
||||||
|
{ // layernorm
|
||||||
|
q = ggml_norm(ctx0, q, eps);
|
||||||
|
q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
|
||||||
|
}
|
||||||
|
struct ggml_tensor *k, *v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
|
||||||
|
{ // layernorm
|
||||||
|
v = ggml_norm(ctx0, v, eps);
|
||||||
|
v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
|
||||||
|
}
|
||||||
|
{ // position
|
||||||
|
// q = ggml_add(ctx0, q, model.mm_model_pos_embed);
|
||||||
|
k = ggml_add(ctx0, v, pos_embed);
|
||||||
|
}
|
||||||
|
|
||||||
|
{ // attention
|
||||||
|
const int hidden_size = 4096;
|
||||||
|
const int d_head = 128;
|
||||||
|
const int n_head = hidden_size/d_head;
|
||||||
|
const int num_query = 96;
|
||||||
|
|
||||||
|
struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
|
||||||
|
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
|
||||||
|
struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
|
||||||
|
struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
|
||||||
|
// permute
|
||||||
|
Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
|
||||||
|
Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
|
||||||
|
Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
|
||||||
|
K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
|
||||||
|
K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
|
||||||
|
K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
|
||||||
|
V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
|
||||||
|
V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
|
||||||
|
V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
|
||||||
|
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||||
|
KQ = ggml_soft_max_inplace(ctx0, KQ);
|
||||||
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
|
||||||
|
KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
|
||||||
|
KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||||
|
KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
|
||||||
|
|
||||||
|
embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
|
||||||
|
}
|
||||||
|
{ // layernorm
|
||||||
|
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||||
|
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
|
||||||
|
}
|
||||||
|
embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
|
||||||
|
}
|
||||||
else {
|
else {
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
@ -878,7 +987,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||||||
}
|
}
|
||||||
|
|
||||||
// read and create ggml_context containing the tensors and their data
|
// read and create ggml_context containing the tensors and their data
|
||||||
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1, std::pair<int, int> load_image_size) {
|
||||||
struct ggml_context * meta = NULL;
|
struct ggml_context * meta = NULL;
|
||||||
|
|
||||||
struct gguf_init_params params = {
|
struct gguf_init_params params = {
|
||||||
@ -1020,7 +1129,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx);
|
new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
|
idx = gguf_find_key(ctx, KEY_HAS_MiniCPMV_PROJ);
|
||||||
|
if (idx != -1) {
|
||||||
|
new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
// GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
|
||||||
|
|
||||||
GGML_ASSERT(new_clip->has_vision_encoder);
|
GGML_ASSERT(new_clip->has_vision_encoder);
|
||||||
GGML_ASSERT(!new_clip->has_text_encoder);
|
GGML_ASSERT(!new_clip->has_text_encoder);
|
||||||
|
|
||||||
@ -1031,6 +1146,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
||||||
LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
||||||
LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
||||||
|
LOG_TEE("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
|
||||||
LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
|
LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
|
||||||
LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
||||||
}
|
}
|
||||||
@ -1272,6 +1388,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
|
vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
|
||||||
vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
|
vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
|
||||||
}
|
}
|
||||||
|
else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) {
|
||||||
|
// vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
|
||||||
|
vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K);
|
||||||
|
vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY);
|
||||||
|
vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ);
|
||||||
|
vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ);
|
||||||
|
vision_model.mm_model_attn_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "weight"));
|
||||||
|
vision_model.mm_model_attn_k_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "weight"));
|
||||||
|
vision_model.mm_model_attn_v_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "weight"));
|
||||||
|
vision_model.mm_model_attn_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "bias"));
|
||||||
|
vision_model.mm_model_attn_k_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "bias"));
|
||||||
|
vision_model.mm_model_attn_v_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "bias"));
|
||||||
|
vision_model.mm_model_attn_o_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "weight"));
|
||||||
|
vision_model.mm_model_attn_o_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "bias"));
|
||||||
|
vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "weight"));
|
||||||
|
vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "bias"));
|
||||||
|
vision_model.mm_model_ln_kv_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "weight"));
|
||||||
|
vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias"));
|
||||||
|
vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
|
||||||
|
vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
|
||||||
|
}
|
||||||
else {
|
else {
|
||||||
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
|
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
|
||||||
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
||||||
@ -1310,7 +1447,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||||||
new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
|
new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
|
||||||
clip_image_f32_batch batch;
|
clip_image_f32_batch batch;
|
||||||
batch.size = 1;
|
batch.size = 1;
|
||||||
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
|
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, load_image_size);
|
||||||
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
|
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
|
||||||
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
|
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
|
||||||
LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
||||||
@ -1424,6 +1561,19 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void uhd_normalize_image_u8_to_f32(struct clip_ctx * ctx, const clip_image_u8* src, clip_image_f32* dst) {
|
||||||
|
dst->nx = src->nx;
|
||||||
|
dst->ny = src->ny;
|
||||||
|
dst->buf.resize(src->buf.size());
|
||||||
|
const auto & m3 = ctx->image_mean;
|
||||||
|
const auto & s3 = ctx->image_std;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < src->buf.size(); ++i) {
|
||||||
|
int c = i % 3; // rgb
|
||||||
|
dst->buf[i] = (static_cast<float>(src->buf[i]) / 255.0f - m3[c]) / s3[c];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
inline float clip(float x, float lower, float upper) {
|
inline float clip(float x, float lower, float upper) {
|
||||||
return std::max(lower, std::min(x, upper));
|
return std::max(lower, std::min(x, upper));
|
||||||
}
|
}
|
||||||
@ -1807,12 +1957,100 @@ int clip_n_patches(const struct clip_ctx * ctx) {
|
|||||||
|
|
||||||
if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
|
if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
|
||||||
n_patches /= 4;
|
n_patches /= 4;
|
||||||
|
} else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
|
||||||
|
n_patches = 96;
|
||||||
}
|
}
|
||||||
|
|
||||||
return n_patches;
|
return n_patches;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
|
static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>>& pos) {
|
||||||
|
assert(embed_dim % 2 == 0);
|
||||||
|
int H = pos.size();
|
||||||
|
int W = pos[0].size();
|
||||||
|
|
||||||
|
std::vector<float> omega(embed_dim / 2);
|
||||||
|
for (int i = 0; i < embed_dim / 2; ++i) {
|
||||||
|
omega[i] = 1.0 / pow(10000.0, static_cast<float>(i) / (embed_dim / 2));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
|
||||||
|
for (int h = 0; h < H; ++h) {
|
||||||
|
for (int w = 0; w < W; ++w) {
|
||||||
|
for (int d = 0; d < embed_dim / 2; ++d) {
|
||||||
|
float out_value = pos[h][w] * omega[d];
|
||||||
|
emb[h][w][d] = sin(out_value);
|
||||||
|
emb[h][w][d + embed_dim / 2] = cos(out_value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return emb;
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::vector<std::vector<std::vector<float>>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector<std::vector<std::vector<float>>>& grid) {
|
||||||
|
assert(embed_dim % 2 == 0);
|
||||||
|
std::vector<std::vector<std::vector<float>>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2)
|
||||||
|
std::vector<std::vector<std::vector<float>>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2)
|
||||||
|
|
||||||
|
int H = emb_h.size();
|
||||||
|
int W = emb_h[0].size();
|
||||||
|
std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
|
||||||
|
|
||||||
|
for (int h = 0; h < H; ++h) {
|
||||||
|
for (int w = 0; w < W; ++w) {
|
||||||
|
for (int d = 0; d < embed_dim / 2; ++d) {
|
||||||
|
emb[h][w][d] = emb_h[h][w][d];
|
||||||
|
emb[h][w][d + embed_dim / 2] = emb_w[h][w][d];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return emb;
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, const std::pair<int, int> image_size) {
|
||||||
|
int grid_h_size = image_size.first;
|
||||||
|
int grid_w_size = image_size.second;
|
||||||
|
|
||||||
|
std::vector<float> grid_h(grid_h_size);
|
||||||
|
std::vector<float> grid_w(grid_w_size);
|
||||||
|
|
||||||
|
for (int i = 0; i < grid_h_size; ++i) {
|
||||||
|
grid_h[i] = static_cast<float>(i);
|
||||||
|
}
|
||||||
|
for (int i = 0; i < grid_w_size; ++i) {
|
||||||
|
grid_w[i] = static_cast<float>(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::vector<float>> grid(grid_h_size, std::vector<float>(grid_w_size));
|
||||||
|
for (int h = 0; h < grid_h_size; ++h) {
|
||||||
|
for (int w = 0; w < grid_w_size; ++w) {
|
||||||
|
grid[h][w] = grid_w[w];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::vector<std::vector<std::vector<float>>> grid_2d = {grid, grid};
|
||||||
|
for (int h = 0; h < grid_h_size; ++h) {
|
||||||
|
for (int w = 0; w < grid_w_size; ++w) {
|
||||||
|
grid_2d[0][h][w] = grid_h[h];
|
||||||
|
grid_2d[1][h][w] = grid_w[w];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::vector<std::vector<float>>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d);
|
||||||
|
|
||||||
|
int H = image_size.first;
|
||||||
|
int W = image_size.second;
|
||||||
|
std::vector<std::vector<float>> pos_embed_2d(H * W, std::vector<float>(embed_dim));
|
||||||
|
for (int h = 0; h < H; ++h) {
|
||||||
|
for (int w = 0; w < W; ++w) {
|
||||||
|
pos_embed_2d[w * H + h] = pos_embed_3d[h][w];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return pos_embed_2d;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec, std::pair<int, int> load_image_size) {
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||||
return false;
|
return false;
|
||||||
@ -1821,10 +2059,10 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
|
|||||||
clip_image_f32_batch imgs{};
|
clip_image_f32_batch imgs{};
|
||||||
imgs.size = 1;
|
imgs.size = 1;
|
||||||
imgs.data = img;
|
imgs.data = img;
|
||||||
return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
|
return clip_image_batch_encode(ctx, n_threads, &imgs, vec, load_image_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
|
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec, std::pair<int, int> load_image_size) {
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||||
return false;
|
return false;
|
||||||
@ -1834,6 +2072,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||||||
if (ctx->has_llava_projector) {
|
if (ctx->has_llava_projector) {
|
||||||
GGML_ASSERT(batch_size == 1); // TODO: support multiple images
|
GGML_ASSERT(batch_size == 1); // TODO: support multiple images
|
||||||
}
|
}
|
||||||
|
if (ctx->has_minicpmv_projector) {
|
||||||
|
GGML_ASSERT(batch_size == 1);
|
||||||
|
}
|
||||||
|
|
||||||
// build the inference graph
|
// build the inference graph
|
||||||
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
|
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
|
||||||
@ -1844,8 +2085,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||||||
const auto & hparams = model.hparams;
|
const auto & hparams = model.hparams;
|
||||||
|
|
||||||
const int image_size = hparams.image_size;
|
const int image_size = hparams.image_size;
|
||||||
|
int image_size_width = image_size;
|
||||||
|
int image_size_height = image_size;
|
||||||
|
if (ctx->has_minicpmv_projector) {
|
||||||
|
image_size_width = load_image_size.first;
|
||||||
|
image_size_height = load_image_size.second;
|
||||||
|
}
|
||||||
const int patch_size = hparams.patch_size;
|
const int patch_size = hparams.patch_size;
|
||||||
const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
|
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
||||||
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
|
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
|
||||||
|
|
||||||
{
|
{
|
||||||
@ -1855,7 +2102,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||||||
for (size_t i = 0; i < imgs->size; i++) {
|
for (size_t i = 0; i < imgs->size; i++) {
|
||||||
const int nx = imgs->data[i].nx;
|
const int nx = imgs->data[i].nx;
|
||||||
const int ny = imgs->data[i].ny;
|
const int ny = imgs->data[i].ny;
|
||||||
|
if (!ctx->has_minicpmv_projector) {
|
||||||
GGML_ASSERT(nx == image_size && ny == image_size);
|
GGML_ASSERT(nx == image_size && ny == image_size);
|
||||||
|
}
|
||||||
|
|
||||||
const int n = nx * ny;
|
const int n = nx * ny;
|
||||||
|
|
||||||
@ -1872,7 +2121,43 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||||||
ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
|
ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
|
||||||
free(data);
|
free(data);
|
||||||
}
|
}
|
||||||
|
if (ctx->has_minicpmv_projector) {
|
||||||
|
{
|
||||||
|
// inspired from siglip:
|
||||||
|
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
|
||||||
|
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
|
||||||
|
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
|
||||||
|
|
||||||
|
int* positions_data = (int*)malloc(ggml_nbytes(positions));
|
||||||
|
for (int i = 0; i < num_positions; i++) {
|
||||||
|
positions_data[i] = std::floor(70.0*i/num_positions);
|
||||||
|
}
|
||||||
|
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
|
||||||
|
free(positions_data);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// inspired from resampler of Qwen-VL:
|
||||||
|
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
|
||||||
|
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
|
||||||
|
struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
|
||||||
|
int pos_w = image_size_width/patch_size;
|
||||||
|
int pos_h = image_size_height/patch_size;
|
||||||
|
int embed_dim = 4096;
|
||||||
|
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
|
||||||
|
|
||||||
|
float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
|
||||||
|
for(int i=0;i<pos_w * pos_h;++i){
|
||||||
|
for(int j=0;j<embed_dim;++j){
|
||||||
|
pos_embed_data[i*embed_dim+j]=pos_embed_t[i][j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
|
||||||
|
free(pos_embed_data);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else{
|
||||||
{
|
{
|
||||||
if (ctx->has_class_embedding) {
|
if (ctx->has_class_embedding) {
|
||||||
struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
|
struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
|
||||||
@ -1904,6 +2189,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||||||
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
|
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
|
||||||
free(patches_data);
|
free(patches_data);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (ggml_backend_is_cpu(ctx->backend)) {
|
if (ggml_backend_is_cpu(ctx->backend)) {
|
||||||
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
|
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
|
||||||
@ -2072,6 +2358,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
|||||||
if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
||||||
return ctx->vision_model.mm_3_b->ne[0];
|
return ctx->vision_model.mm_3_b->ne[0];
|
||||||
}
|
}
|
||||||
|
if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
|
||||||
|
return 4096;
|
||||||
|
}
|
||||||
|
|
||||||
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
|
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
|
||||||
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
||||||
|
@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
#ifdef LLAMA_SHARED
|
#ifdef LLAMA_SHARED
|
||||||
# if defined(_WIN32) && !defined(__MINGW32__)
|
# if defined(_WIN32) && !defined(__MINGW32__)
|
||||||
@ -36,7 +37,7 @@ struct clip_image_f32_batch {
|
|||||||
size_t size;
|
size_t size;
|
||||||
};
|
};
|
||||||
|
|
||||||
CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity);
|
CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity, std::pair<int, int> load_image_size = {448, 448});
|
||||||
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
|
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
|
||||||
|
|
||||||
CLIP_API void clip_free(struct clip_ctx * ctx);
|
CLIP_API void clip_free(struct clip_ctx * ctx);
|
||||||
@ -71,10 +72,12 @@ CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t byt
|
|||||||
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
|
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
|
||||||
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
|
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
|
||||||
|
|
||||||
|
CLIP_API void uhd_normalize_image_u8_to_f32(struct clip_ctx * ctx, const clip_image_u8* src, clip_image_f32* dst);
|
||||||
|
|
||||||
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
|
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
|
||||||
|
|
||||||
CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
|
CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec, std::pair<int, int> load_image_size = {448, 448});
|
||||||
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
|
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec, std::pair<int, int> load_image_size = {448, 448});
|
||||||
|
|
||||||
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
|
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
|
||||||
|
|
||||||
|
@ -409,3 +409,342 @@ void llava_image_embed_free(struct llava_image_embed * embed) {
|
|||||||
free(embed->embed);
|
free(embed->embed);
|
||||||
free(embed);
|
free(embed);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Normalizes one u8 image and runs it through the CLIP encoder.
//
//   ctx_clip    CLIP context used for normalization and encoding.
//   n_threads   thread count forwarded to clip_image_encode.
//   img         source image; its nx/ny are passed to the encoder as the load size.
//   image_embd  output buffer for the embedding (caller-allocated).
//   n_img_pos   receives clip_n_patches(ctx_clip).
//
// Returns false when clip_image_encode fails, true otherwise.
static bool encode_image_with_clip_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
    // format VectN x H x W x RGB (N x 448 x 448 x 3)
    clip_image_f32 * img_res_v = clip_image_f32_init();
    std::pair<int, int> load_image_size;
    load_image_size.first  = img->nx;
    load_image_size.second = img->ny;
    uhd_normalize_image_u8_to_f32(ctx_clip, img, img_res_v);

    const int64_t t_img_enc_start_us = ggml_time_us();

    const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
    LOG_TEE("\n%s: mm_patch_merge_type is %s.\n", __func__, mm_patch_merge_type);

    *n_img_pos = clip_n_patches(ctx_clip);
    bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v, image_embd, load_image_size); // image_embd shape is 96 x 4096

    // The normalized copy is only needed for the encode call; release it on every path.
    // NOTE(review): assumes clip_image_f32_free is the counterpart of clip_image_f32_init — confirm in clip.h.
    clip_image_f32_free(img_res_v);

    if (!encoded) {
        LOG_TEE("Unable to encode image\n");
        return false;
    }
    LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);

    const int64_t t_img_enc_end_us = ggml_time_us();
    float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
    LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);

    return true;
}
|
||||||
|
|
||||||
|
// Rounds `length` to the nearest multiple of `patch_size`, never returning less than
// one full `patch_size`.
static int ensure_divide(int length, int patch_size) {
    const float n_patches = std::round(static_cast<float>(length) / patch_size);
    const int   snapped   = static_cast<int>(n_patches * patch_size);
    if (snapped < patch_size) {
        return patch_size;
    }
    return snapped;
}
|
||||||
|
|
||||||
|
static std::pair<int, int> uhd_find_best_resize(std::pair<int, int> original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
|
||||||
|
int width = original_size.first;
|
||||||
|
int height = original_size.second;
|
||||||
|
if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
|
||||||
|
float r = static_cast<float>(width) / height;
|
||||||
|
height = static_cast<int>(scale_resolution / std::sqrt(r));
|
||||||
|
width = static_cast<int>(height * r);
|
||||||
|
}
|
||||||
|
int best_width = ensure_divide(width, patch_size);
|
||||||
|
int best_height = ensure_divide(height, patch_size);
|
||||||
|
return std::make_pair(best_width, best_height);
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
|
||||||
|
int width, height;
|
||||||
|
std::tie(width, height) = original_size;
|
||||||
|
int grid_x, grid_y;
|
||||||
|
std::tie(grid_x, grid_y) = grid;
|
||||||
|
|
||||||
|
int refine_width = ensure_divide(width, grid_x);
|
||||||
|
int refine_height = ensure_divide(height, grid_y);
|
||||||
|
|
||||||
|
int grid_width = refine_width / grid_x;
|
||||||
|
int grid_height = refine_height / grid_y;
|
||||||
|
|
||||||
|
// auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line)
|
||||||
|
auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair
|
||||||
|
int best_grid_width, best_grid_height;
|
||||||
|
std::tie(best_grid_width, best_grid_height) = best_grid_size;
|
||||||
|
|
||||||
|
// std::pair<int, int> refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line)
|
||||||
|
std::pair<int, int> refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line)
|
||||||
|
return refine_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clamps x to [lower, upper]. The upper bound is applied first, then the lower bound,
// so `lower` wins when the bounds are inverted.
inline int clip(int x, int lower, int upper) {
    const int capped = (upper < x) ? upper : x;
    return (lower < capped) ? capped : lower;
}
|
||||||
|
|
||||||
|
static bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) {
|
||||||
|
const int nx = img.nx;
|
||||||
|
const int ny = img.ny;
|
||||||
|
|
||||||
|
dst.nx = target_width;
|
||||||
|
dst.ny = target_height;
|
||||||
|
dst.buf.resize(3 * target_width * target_height);
|
||||||
|
|
||||||
|
float Cc;
|
||||||
|
float C[5];
|
||||||
|
float d0, d2, d3, a0, a1, a2, a3;
|
||||||
|
int i, j, k, jj;
|
||||||
|
int x, y;
|
||||||
|
float dx, dy;
|
||||||
|
float tx, ty;
|
||||||
|
|
||||||
|
tx = (float)nx / (float)target_width;
|
||||||
|
ty = (float)ny / (float)target_height;
|
||||||
|
|
||||||
|
// Bicubic interpolation; adapted from ViT.cpp, inspired from :
|
||||||
|
// -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
|
||||||
|
// -> https://en.wikipedia.org/wiki/Bicubic_interpolation
|
||||||
|
|
||||||
|
for (i = 0; i < target_height; i++) {
|
||||||
|
for (j = 0; j < target_width; j++) {
|
||||||
|
x = (int)(tx * j);
|
||||||
|
y = (int)(ty * i);
|
||||||
|
|
||||||
|
dx = tx * j - x;
|
||||||
|
dy = ty * i - y;
|
||||||
|
|
||||||
|
for (k = 0; k < 3; k++) {
|
||||||
|
for (jj = 0; jj <= 3; jj++) {
|
||||||
|
d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
|
||||||
|
d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
|
||||||
|
d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
|
||||||
|
a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
|
||||||
|
|
||||||
|
a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
|
||||||
|
a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
|
||||||
|
a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
|
||||||
|
|
||||||
|
C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
|
||||||
|
|
||||||
|
d0 = C[0] - C[1];
|
||||||
|
d2 = C[2] - C[1];
|
||||||
|
d3 = C[3] - C[1];
|
||||||
|
a0 = C[1];
|
||||||
|
a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
|
||||||
|
a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
|
||||||
|
a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
|
||||||
|
Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
|
||||||
|
|
||||||
|
const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
|
||||||
|
dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// inspired from LLaVA-UHD:
|
||||||
|
// -> https://arxiv.org/pdf/2403.11703
|
||||||
|
// -> https://github.com/thunlp/LLaVA-UHD
|
||||||
|
// -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
|
||||||
|
static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
|
||||||
|
const std::pair<int, int> original_size={img->nx,img->ny};
|
||||||
|
const int original_width = img->nx;
|
||||||
|
const int original_height = img->ny;
|
||||||
|
const float log_ratio = log(1.0*original_width/original_height); //
|
||||||
|
const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
|
||||||
|
const int multiple = fmin(ceil(ratio), max_slice_nums);
|
||||||
|
|
||||||
|
std::vector<std::vector<clip_image_u8 *>> images;
|
||||||
|
LOG_TEE("%s: multiple %d\n", __func__, multiple);
|
||||||
|
images.push_back(std::vector<clip_image_u8 *>());
|
||||||
|
|
||||||
|
if(multiple <= 1){
|
||||||
|
auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
|
||||||
|
clip_image_u8 *source_image = clip_image_u8_init();
|
||||||
|
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
|
||||||
|
// source_image = image.resize(best_size, Image.Resampling.BICUBIC)
|
||||||
|
images[images.size()-1].push_back(source_image);
|
||||||
|
}
|
||||||
|
else if(multiple > 1){
|
||||||
|
|
||||||
|
std::vector<int> candidate_split_grids_nums;
|
||||||
|
for (int i : {multiple - 1, multiple, multiple + 1}) {
|
||||||
|
if (i == 1 || i > max_slice_nums) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
candidate_split_grids_nums.push_back(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
|
||||||
|
clip_image_u8 *source_image = clip_image_u8_init();
|
||||||
|
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
|
||||||
|
// source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
|
||||||
|
LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
|
||||||
|
images[images.size()-1].push_back(source_image);
|
||||||
|
|
||||||
|
std::vector<std::pair<int, int>> candidate_grids;
|
||||||
|
|
||||||
|
for (int split_grids_nums : candidate_split_grids_nums) {
|
||||||
|
int m = 1;
|
||||||
|
while (m <= split_grids_nums) {
|
||||||
|
if (split_grids_nums % m == 0) {
|
||||||
|
candidate_grids.emplace_back(m, split_grids_nums / m);
|
||||||
|
}
|
||||||
|
++m;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::pair<int, int> best_grid{1, 1};
|
||||||
|
float min_error = std::numeric_limits<float>::infinity();
|
||||||
|
|
||||||
|
for (const auto& grid : candidate_grids) {
|
||||||
|
float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second));
|
||||||
|
if (error < min_error) {
|
||||||
|
best_grid = grid;
|
||||||
|
min_error = error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
|
||||||
|
|
||||||
|
auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
|
||||||
|
clip_image_u8 *refine_image = clip_image_u8_init();
|
||||||
|
bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
|
||||||
|
|
||||||
|
LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
|
||||||
|
|
||||||
|
// split_to_patches
|
||||||
|
int width = refine_image->nx;
|
||||||
|
int height = refine_image->ny;
|
||||||
|
int grid_x = int(width / best_grid.first);
|
||||||
|
int grid_y = int(height / best_grid.second);
|
||||||
|
for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
|
||||||
|
images.push_back(std::vector<clip_image_u8 *>());
|
||||||
|
for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
|
||||||
|
clip_image_u8 * patch = clip_image_u8_init();
|
||||||
|
patch->nx = grid_x;
|
||||||
|
patch->ny = grid_y;
|
||||||
|
patch->buf.resize(3 * patch->nx * patch->ny);
|
||||||
|
for (int y = patches_i; y < patches_i + grid_y; ++y) {
|
||||||
|
for (int x = patches_j; x < patches_j + grid_x; ++x) {
|
||||||
|
const int i = 3 * (y * refine_image->nx + x);
|
||||||
|
const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j));
|
||||||
|
patch->buf[j] = refine_image->buf[i];
|
||||||
|
patch->buf[j+1] = refine_image->buf[i+1];
|
||||||
|
patch->buf[j+2] = refine_image->buf[i+2];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
images[images.size()-1].push_back(patch);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return images;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img) {
|
||||||
|
std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img);
|
||||||
|
for (size_t i = 0; i < imgs.size(); ++i){
|
||||||
|
for (size_t j = 0; j < imgs[i].size(); ++j) {
|
||||||
|
LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
struct uhd_image_embed * results = new uhd_image_embed();
|
||||||
|
|
||||||
|
for (size_t i = 0; i < imgs.size(); ++i){
|
||||||
|
results->image_embeds.push_back(std::vector<llava_image_embed *>());
|
||||||
|
for (size_t j = 0; j < imgs[i].size(); ++j) {
|
||||||
|
float* image_embed = NULL;
|
||||||
|
int n_image_pos = 0;
|
||||||
|
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, imgs[i][j], &image_embed, &n_image_pos);
|
||||||
|
if (!image_embed_result) {
|
||||||
|
LOG_TEE("%s: coulnd't embed the image\n", __func__);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed));
|
||||||
|
result->embed = image_embed;
|
||||||
|
result->n_image_pos = n_image_pos;
|
||||||
|
results->image_embeds[i].push_back(result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return results;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Embeds `img` slice by slice and packs the slice embeddings, separated by marker
// tokens loaded from a raw file, into one contiguous float buffer.
//
//   ctx_clip        CLIP context.
//   n_threads       thread count used while embedding.
//   img             source image.
//   image_embd_out  receives a malloc'ed buffer of *n_img_pos_out tokens, each
//                   clip_n_mmproj_embd(ctx_clip) floats wide; the caller frees it.
//   n_img_pos_out   receives the number of tokens written.
//
// Token order: marker 0, overview embedding, marker 1. When tiles exist this is
// followed by marker 2, then for every tile (marker 0, tile embedding, marker 1) with
// marker 4 after the last tile of each row, and finally marker 3.
//
// Returns false when embedding fails, the marker file cannot be loaded or is too
// short, or allocation fails. The out-parameters are left untouched in that case.
bool llava_image_embed_make_with_clip_img_ollama(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
    struct uhd_image_embed * embeds = llava_image_embed_make_with_bytes_uhd(ctx_clip, n_threads, img);
    if (!embeds || embeds->image_embeds.empty() || embeds->image_embeds[0].empty() || !embeds->image_embeds[0][0]) {
        LOG_TEE("%s: failed to embeding image\n", __func__);
        if (embeds) {
            llava_image_embed_free_uhd(embeds);
            delete embeds;
        }
        return false;
    }
    const std::vector<std::vector<struct llava_image_embed *>> & image_embed_slices = embeds->image_embeds;

    std::string fname = "./examples/minicpm-v2.5/slice_token_for_ollama.raw";
    unsigned char * slice_token = NULL;
    long slice_token_length = 0;
    auto loaded = load_file_to_bytes(fname.c_str(), &slice_token, &slice_token_length);
    if (!loaded) {
        LOG_TEE("%s: failed to load %s\n", __func__, fname.c_str());
        llava_image_embed_free_uhd(embeds);
        delete embeds;
        return false;
    }

    const size_t n_embd    = clip_n_mmproj_embd(ctx_clip);  // floats per token
    const size_t token_len = n_embd * sizeof(float);        // bytes per token
    const size_t n_patches = clip_n_patches(ctx_clip);      // tokens per slice embedding
    const size_t n_markers = 5;                             // marker tokens read from the file

    bool ok = true;
    if (slice_token_length < 0 || (size_t) slice_token_length < n_markers * token_len) {
        LOG_TEE("%s: %s is too small\n", __func__, fname.c_str());
        ok = false;
    }

    // Count the tokens up front so the buffer is sized exactly.
    size_t n_tokens = n_patches + 2;
    if (image_embed_slices.size() > 1) {
        n_tokens += 2;
        for (size_t i = 1; i < image_embed_slices.size(); ++i) {
            n_tokens += image_embed_slices[i].size() * (n_patches + 2);
            if (!image_embed_slices[i].empty()) {
                n_tokens += 1;
            }
        }
    }

    float * all_image_embd = NULL;
    if (ok) {
        all_image_embd = (float *)malloc(n_tokens * token_len);
        if (!all_image_embd) {
            LOG_TEE("%s: failed to allocate image embedding\n", __func__);
            ok = false;
        }
    }

    size_t all_n_img_pos = 0;
    if (ok) {
        // The destination is a float pointer, so it advances by n_embd floats per token.
        // Advancing it by the byte length would scatter the tokens at four times the stride.
        auto append = [&](const void * src, size_t count) {
            std::memcpy(all_image_embd + n_embd * all_n_img_pos, src, count * token_len);
            all_n_img_pos += count;
        };
        // slice_token is a byte pointer, so marker k starts k * token_len bytes in.
        auto marker = [&](size_t k) -> const void * {
            return slice_token + token_len * k;
        };

        append(marker(0), 1);
        append(image_embed_slices[0][0]->embed, n_patches);
        append(marker(1), 1);
        if (image_embed_slices.size() > 1) {
            append(marker(2), 1);
            for (size_t i = 1; i < image_embed_slices.size(); ++i) {
                for (size_t j = 0; j < image_embed_slices[i].size(); ++j) {
                    append(marker(0), 1);
                    append(image_embed_slices[i][j]->embed, n_patches);
                    append(marker(1), 1);
                    if (j == image_embed_slices[i].size() - 1) {
                        append(marker(4), 1);
                    }
                }
            }
            append(marker(3), 1);
        }

        *image_embd_out = all_image_embd;
        *n_img_pos_out  = (int) all_n_img_pos;
    }

    // Everything has been copied into the output buffer; release the intermediates.
    free(slice_token);
    llava_image_embed_free_uhd(embeds);
    delete embeds;

    return ok;
}
|
||||||
|
|
||||||
|
struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
|
||||||
|
unsigned char* image_bytes;
|
||||||
|
long image_bytes_length;
|
||||||
|
auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
|
||||||
|
if (!loaded) {
|
||||||
|
LOG_TEE("%s: failed to load %s\n", __func__, image_path);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
clip_image_u8 * img = clip_image_u8_init();
|
||||||
|
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
|
||||||
|
clip_image_u8_free(img);
|
||||||
|
LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct uhd_image_embed * embeds = llava_image_embed_make_with_bytes_uhd(ctx_clip, n_threads, img);
|
||||||
|
|
||||||
|
clip_image_u8_free(img);
|
||||||
|
free(image_bytes);
|
||||||
|
return embeds;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Frees every embedding stored in `embed` and leaves its image_embeds empty.
// A NULL `embed` is ignored.
//
// NOTE(review): the uhd_image_embed object itself is not released here; it stays
// with the caller — confirm that callers dispose of it.
void llava_image_embed_free_uhd(struct uhd_image_embed * embed) {
    if (embed == NULL) {
        return;
    }
    for (size_t i = 0; i < embed->image_embeds.size(); ++i) {
        for (size_t j = 0; j < embed->image_embeds[i].size(); ++j) {
            // Each entry owns its float buffer and was itself allocated with malloc.
            free(embed->image_embeds[i][j]->embed);
            free(embed->image_embeds[i][j]);
        }
    }
    // Replace the container so no dangling pointers remain reachable.
    embed->image_embeds = std::vector<std::vector<struct llava_image_embed *>>();
}
|
@ -19,6 +19,10 @@
|
|||||||
|
|
||||||
struct clip_ctx;
|
struct clip_ctx;
|
||||||
|
|
||||||
|
// Embeddings produced for one sliced image.
struct uhd_image_embed {
    // Rows of embeddings, one inner vector per row of slices. Each pointer refers
    // to a separately allocated llava_image_embed.
    std::vector<std::vector<struct llava_image_embed *>> image_embeds;
};
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
@ -40,6 +44,13 @@ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct
|
|||||||
LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
|
LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
|
||||||
/** free an embedding made with llava_image_embed_make_* */
|
/** free an embedding made with llava_image_embed_make_* */
|
||||||
|
|
||||||
|
/** build an image embed from image file bytes */
|
||||||
|
LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img);
|
||||||
|
/** build an image embed from a path to an image filename */
|
||||||
|
LLAVA_API bool llava_image_embed_make_with_clip_img_ollama(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
|
||||||
|
LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
|
||||||
|
LLAVA_API void llava_image_embed_free_uhd(struct uhd_image_embed * embed);
|
||||||
|
|
||||||
/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
|
/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
|
||||||
LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
|
LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
|
||||||
|
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
#include "log.h"
|
#include "log.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "clip.h"
|
#include "clip.h"
|
||||||
#include "minicpmv.h"
|
#include "llava.h"
|
||||||
#include "minicpmv_wrapper.h"
|
#include "minicpmv_wrapper.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
@ -1,7 +1,7 @@
|
|||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "clip.h"
|
#include "clip.h"
|
||||||
#include "minicpmv.h"
|
#include "llava.h"
|
||||||
#include "minicpmv_wrapper.h"
|
#include "minicpmv_wrapper.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include <cstdio>
|
#include <cstdio>
|
@ -3,7 +3,7 @@
|
|||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "clip.h"
|
#include "clip.h"
|
||||||
#include "minicpmv.h"
|
#include "llava.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#ifdef LLAMA_SHARED
|
#ifdef LLAMA_SHARED
|
@ -1,3 +1,4 @@
|
|||||||
-r ../../requirements/requirements-convert-legacy-llama.txt
|
-r ../../requirements/requirements-convert-legacy-llama.txt
|
||||||
pillow~=10.2.0
|
pillow~=10.2.0
|
||||||
torch~=2.1.1
|
torch~=2.1.1
|
||||||
|
torchvision==0.16.2
|
@ -1,42 +0,0 @@
|
|||||||
# Build script for the MiniCPM-V example.
#
# Targets defined here:
#   minicpmv          object library: image embedding code + CLIP encoder
#   minicpmv_static   static archive built from the minicpmv objects
#   minicpmv_shared   shared library built from the same objects (BUILD_SHARED_LIBS only)
#   minicpmv_wrapper  object library with the wrapper helpers used by the CLI
#   minicpmv-cli      command line executable

add_library(minicpmv OBJECT
    minicpmv.cpp
    minicpmv.h
    clip.cpp
    clip.h
)

target_link_libraries(minicpmv PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})

# Relative entries are resolved against the current source directory.
target_include_directories(minicpmv PUBLIC
    .
    ../..
    ../../common
)

target_compile_features(minicpmv PRIVATE cxx_std_11)

add_library(minicpmv_static STATIC $<TARGET_OBJECTS:minicpmv>)

if (BUILD_SHARED_LIBS)
    # The objects end up in a shared library, so they must be position independent
    # and compiled with the export macros enabled.
    set_target_properties(minicpmv PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_compile_definitions(minicpmv PRIVATE LLAMA_SHARED LLAMA_BUILD)
    add_library(minicpmv_shared SHARED $<TARGET_OBJECTS:minicpmv>)
    target_link_libraries(minicpmv_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
    install(TARGETS minicpmv_shared LIBRARY)
endif()

if (NOT MSVC)
    target_compile_options(minicpmv PRIVATE -Wno-cast-qual) # stb_image.h
endif()

if (TARGET BUILD_INFO)
    add_dependencies(minicpmv BUILD_INFO)
endif()

# Wrapper helpers; defined before the executable that consumes them.
add_library(minicpmv_wrapper OBJECT
    minicpmv_wrapper.cpp
)
target_link_libraries(minicpmv_wrapper PRIVATE minicpmv ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(minicpmv_wrapper PRIVATE cxx_std_11)

set(TARGET minicpmv-cli)
add_executable(${TARGET} minicpmv-cli.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common minicpmv_wrapper minicpmv ${CMAKE_THREAD_LIBS_INIT})
# The original repeated the feature request on `minicpmv` here; the executable is
# the target that was still missing it.
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
|
File diff suppressed because it is too large
Load Diff
@ -1,85 +0,0 @@
|
|||||||
#ifndef CLIP_H
#define CLIP_H

#include <stddef.h>
#include <stdint.h>
#include <utility>

// CLIP_API marks the public entry points: dllexport/dllimport on Windows
// (non-MinGW) shared builds, default visibility on other shared builds, and
// nothing for static builds.
#ifdef LLAMA_SHARED
#    if defined(_WIN32) && !defined(__MINGW32__)
#        ifdef LLAMA_BUILD
#            define CLIP_API __declspec(dllexport)
#        else
#            define CLIP_API __declspec(dllimport)
#        endif
#    else
#        define CLIP_API __attribute__ ((visibility ("default")))
#    endif
#else
#    define CLIP_API
#endif

// Opaque handle; the definition is not part of this header.
struct clip_ctx;

#ifdef __cplusplus
extern "C" {
#endif

// Pointer + element count pair for a group of uint8 images.
struct clip_image_u8_batch {
    struct clip_image_u8 * data;
    size_t size;
};

// Pointer + element count pair for a group of float32 images.
struct clip_image_f32_batch {
    struct clip_image_f32 * data;
    size_t size;
};

//
// context creation / destruction
//

CLIP_API struct clip_ctx * clip_model_load    (const char * fname, int verbosity, std::pair<int, int> load_image_size);
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);

CLIP_API void clip_free(struct clip_ctx * ctx);

//
// context queries
//

CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);

CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);

// TODO: should be enum, not string
CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);

CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);

CLIP_API int clip_n_patches    (const struct clip_ctx * ctx);
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);

//
// image objects
//

CLIP_API struct clip_image_u8  * clip_image_u8_init ();
CLIP_API struct clip_image_f32 * clip_image_f32_init();

CLIP_API void clip_image_u8_free (struct clip_image_u8  * img);
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch  * batch);
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);

CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);

/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);

CLIP_API void normalize_image_u8_to_f32(struct clip_ctx * ctx, const struct clip_image_u8 * src, struct clip_image_f32 * dst);

//
// encoding / quantization
//

CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);

CLIP_API bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec, std::pair<int, int> load_image_size);
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec, std::pair<int, int> load_image_size);

CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);

#ifdef __cplusplus
}
#endif

#endif // CLIP_H
|
|
@ -1,452 +0,0 @@
|
|||||||
#include "clip.h"
|
|
||||||
#include "common.h"
|
|
||||||
#include "llama.h"
|
|
||||||
#include "minicpmv.h"
|
|
||||||
#include "base64.hpp"
|
|
||||||
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <vector>
|
|
||||||
#include <numeric>
|
|
||||||
|
|
||||||
// 8-bit RGB image.
struct clip_image_u8 {
    int nx; // width in pixels
    int ny; // height in pixels

    std::vector<uint8_t> buf; // interleaved samples, 3 per pixel
};
|
|
||||||
|
|
||||||
// float32 RGB image, NHWC.
// Samples are interleaved in memory: RGBRGBRGB...
struct clip_image_f32 {
    int nx; // width in pixels
    int ny; // height in pixels

    std::vector<float> buf;
};
|
|
||||||
|
|
||||||
// Pair of integers describing a grid shape.
// NOTE(review): not referenced by the other definitions visible in this file.
struct clip_image_grid_shape {
    int first;
    int second;
};
|
|
||||||
|
|
||||||
// Normalize one RGB image and run it through the CLIP encoder.
//
//   ctx_clip    CLIP context used for normalization and encoding
//   n_threads   forwarded to clip_image_encode
//   img         input image; its nx/ny are passed to the encoder as load_image_size
//   image_embd  caller-provided output buffer for the embedding
//   n_img_pos   receives clip_n_patches(ctx_clip)
//
// Returns false when the encoder reports failure. The temporary float image is
// released on every path (the original leaked it).
static bool encode_image_with_clip_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
    std::pair<int, int> load_image_size;
    load_image_size.first  = img->nx;
    load_image_size.second = img->ny;

    clip_image_f32 * img_res_v = clip_image_f32_init();
    normalize_image_u8_to_f32(ctx_clip, img, img_res_v);

    const int64_t t_img_enc_start_us = ggml_time_us();

    const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
    LOG_TEE("\n%s: mm_patch_merge_type is %s.\n", __func__, mm_patch_merge_type);

    *n_img_pos = clip_n_patches(ctx_clip);
    const bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v, image_embd, load_image_size); // image_embd shape is 96 x 4096

    // the normalized copy is only an input to the encoder; nothing refers to it afterwards
    clip_image_f32_free(img_res_v);

    if (!encoded) {
        LOG_TEE("Unable to encode image\n");
        return false;
    }
    LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);

    const int64_t t_img_enc_end_us = ggml_time_us();
    float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
    LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);

    return true;
}
|
|
||||||
|
|
||||||
bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
|
|
||||||
// make sure that the correct mmproj was used, i.e., compare apples to apples
|
|
||||||
int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
|
|
||||||
auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
|
|
||||||
if (n_image_embd != n_llama_embd) {
|
|
||||||
LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Allocate an embedding buffer and fill it by encoding img with CLIP.
//
// On success *image_embd_out receives a malloc'ed buffer (release it with
// free) and *n_img_pos_out the number of positions reported by the encoder.
// On failure nothing is written to the outputs and false is returned.
bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
    // NOTE(review): the factor 6 comes from the original code; its rationale is not visible here.
    const size_t embd_buf_bytes = clip_embd_nbytes(ctx_clip)*6;

    float * embd_buf = static_cast<float *>(malloc(embd_buf_bytes));
    if (embd_buf == nullptr) {
        LOG_TEE("Unable to allocate memory for image embeddings\n");
        return false;
    }

    int n_pos;
    const bool encoded = encode_image_with_clip_uhd(ctx_clip, n_threads, img, embd_buf, &n_pos);
    if (!encoded) {
        LOG_TEE("%s: cannot encode image, aborting\n", __func__);
        free(embd_buf);
        return false;
    }

    *n_img_pos_out  = n_pos;
    *image_embd_out = embd_buf;
    return true;
}
|
|
||||||
|
|
||||||
// Feed an image embedding to the llama context in chunks of at most n_batch
// positions. *n_past is advanced by the number of positions decoded so far;
// returns false as soon as one chunk fails to decode.
bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
    const int n_embd = llama_n_embd(llama_get_model(ctx_llama));

    int offset = 0;
    while (offset < image_embed->n_image_pos) {
        const int remaining = image_embed->n_image_pos - offset;
        const int n_eval    = remaining > n_batch ? n_batch : remaining;

        // embeddings are consumed as consecutive rows of n_embd floats
        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+offset*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
        if (llama_decode(ctx_llama, batch) != 0) {
            LOG_TEE("%s : failed to eval\n", __func__);
            return false;
        }

        *n_past += n_eval;
        offset  += n_batch;
    }

    return true;
}
|
|
||||||
|
|
||||||
// Round length to the nearest multiple of patch_size, never returning less
// than one patch_size.
static int ensure_divide(int length, int patch_size) {
    const float n_patches = std::round(static_cast<float>(length) / patch_size);
    const int   snapped   = static_cast<int>(n_patches * patch_size);
    return std::max(snapped, patch_size);
}
|
|
||||||
|
|
||||||
static std::pair<int, int> uhd_find_best_resize(std::pair<int, int> original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
|
|
||||||
int width = original_size.first;
|
|
||||||
int height = original_size.second;
|
|
||||||
if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
|
|
||||||
float r = static_cast<float>(width) / height;
|
|
||||||
height = static_cast<int>(scale_resolution / std::sqrt(r));
|
|
||||||
width = static_cast<int>(height * r);
|
|
||||||
}
|
|
||||||
int best_width = ensure_divide(width, patch_size);
|
|
||||||
int best_height = ensure_divide(height, patch_size);
|
|
||||||
return std::make_pair(best_width, best_height);
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
|
|
||||||
int width, height;
|
|
||||||
std::tie(width, height) = original_size;
|
|
||||||
int grid_x, grid_y;
|
|
||||||
std::tie(grid_x, grid_y) = grid;
|
|
||||||
|
|
||||||
int refine_width = ensure_divide(width, grid_x);
|
|
||||||
int refine_height = ensure_divide(height, grid_y);
|
|
||||||
|
|
||||||
int grid_width = refine_width / grid_x;
|
|
||||||
int grid_height = refine_height / grid_y;
|
|
||||||
|
|
||||||
// auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line)
|
|
||||||
auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair
|
|
||||||
int best_grid_width, best_grid_height;
|
|
||||||
std::tie(best_grid_width, best_grid_height) = best_grid_size;
|
|
||||||
|
|
||||||
// std::pair<int, int> refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line)
|
|
||||||
std::pair<int, int> refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line)
|
|
||||||
return refine_size;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Clamp x: first cap it at upper, then raise it to at least lower.
inline int clip(int x, int lower, int upper) {
    const int capped = upper < x ? upper : x;
    return capped < lower ? lower : capped;
}
|
|
||||||
|
|
||||||
static bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) {
|
|
||||||
const int nx = img.nx;
|
|
||||||
const int ny = img.ny;
|
|
||||||
|
|
||||||
dst.nx = target_width;
|
|
||||||
dst.ny = target_height;
|
|
||||||
dst.buf.resize(3 * target_width * target_height);
|
|
||||||
|
|
||||||
float Cc;
|
|
||||||
float C[5];
|
|
||||||
float d0, d2, d3, a0, a1, a2, a3;
|
|
||||||
int i, j, k, jj;
|
|
||||||
int x, y;
|
|
||||||
float dx, dy;
|
|
||||||
float tx, ty;
|
|
||||||
|
|
||||||
tx = (float)nx / (float)target_width;
|
|
||||||
ty = (float)ny / (float)target_height;
|
|
||||||
|
|
||||||
// Bicubic interpolation; adapted from ViT.cpp, inspired from :
|
|
||||||
// -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
|
|
||||||
// -> https://en.wikipedia.org/wiki/Bicubic_interpolation
|
|
||||||
|
|
||||||
for (i = 0; i < target_height; i++) {
|
|
||||||
for (j = 0; j < target_width; j++) {
|
|
||||||
x = (int)(tx * j);
|
|
||||||
y = (int)(ty * i);
|
|
||||||
|
|
||||||
dx = tx * j - x;
|
|
||||||
dy = ty * i - y;
|
|
||||||
|
|
||||||
for (k = 0; k < 3; k++) {
|
|
||||||
for (jj = 0; jj <= 3; jj++) {
|
|
||||||
d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
|
|
||||||
d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
|
|
||||||
d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
|
|
||||||
a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
|
|
||||||
|
|
||||||
a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
|
|
||||||
a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
|
|
||||||
a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
|
|
||||||
|
|
||||||
C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
|
|
||||||
|
|
||||||
d0 = C[0] - C[1];
|
|
||||||
d2 = C[2] - C[1];
|
|
||||||
d3 = C[3] - C[1];
|
|
||||||
a0 = C[1];
|
|
||||||
a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
|
|
||||||
a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
|
|
||||||
a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
|
|
||||||
Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
|
|
||||||
|
|
||||||
const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
|
|
||||||
dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// inspired from LLaVA-UHD:
|
|
||||||
// -> https://arxiv.org/pdf/2403.11703
|
|
||||||
// -> https://github.com/thunlp/LLaVA-UHD
|
|
||||||
// -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
|
|
||||||
static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
|
|
||||||
const std::pair<int, int> original_size={img->nx,img->ny};
|
|
||||||
const int original_width = img->nx;
|
|
||||||
const int original_height = img->ny;
|
|
||||||
const float log_ratio = log(1.0*original_width/original_height); //
|
|
||||||
const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
|
|
||||||
const int multiple = fmin(ceil(ratio), max_slice_nums);
|
|
||||||
|
|
||||||
std::vector<std::vector<clip_image_u8 *>> images;
|
|
||||||
LOG_TEE("%s: multiple %d\n", __func__, multiple);
|
|
||||||
images.push_back(std::vector<clip_image_u8 *>());
|
|
||||||
|
|
||||||
if(multiple <= 1){
|
|
||||||
auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
|
|
||||||
clip_image_u8 *source_image = clip_image_u8_init();
|
|
||||||
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
|
|
||||||
// source_image = image.resize(best_size, Image.Resampling.BICUBIC)
|
|
||||||
images[images.size()-1].push_back(source_image);
|
|
||||||
}
|
|
||||||
else if(multiple > 1){
|
|
||||||
|
|
||||||
std::vector<int> candidate_split_grids_nums;
|
|
||||||
for (int i : {multiple - 1, multiple, multiple + 1}) {
|
|
||||||
if (i == 1 || i > max_slice_nums) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
candidate_split_grids_nums.push_back(i);
|
|
||||||
}
|
|
||||||
|
|
||||||
auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
|
|
||||||
clip_image_u8 *source_image = clip_image_u8_init();
|
|
||||||
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
|
|
||||||
// source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
|
|
||||||
LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
|
|
||||||
images[images.size()-1].push_back(source_image);
|
|
||||||
|
|
||||||
std::vector<std::pair<int, int>> candidate_grids;
|
|
||||||
|
|
||||||
for (int split_grids_nums : candidate_split_grids_nums) {
|
|
||||||
int m = 1;
|
|
||||||
while (m <= split_grids_nums) {
|
|
||||||
if (split_grids_nums % m == 0) {
|
|
||||||
candidate_grids.emplace_back(m, split_grids_nums / m);
|
|
||||||
}
|
|
||||||
++m;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::pair<int, int> best_grid{1, 1};
|
|
||||||
float min_error = std::numeric_limits<float>::infinity();
|
|
||||||
|
|
||||||
for (const auto& grid : candidate_grids) {
|
|
||||||
float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second));
|
|
||||||
if (error < min_error) {
|
|
||||||
best_grid = grid;
|
|
||||||
min_error = error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
|
|
||||||
|
|
||||||
auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
|
|
||||||
clip_image_u8 *refine_image = clip_image_u8_init();
|
|
||||||
bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
|
|
||||||
|
|
||||||
LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
|
|
||||||
|
|
||||||
// split_to_patches
|
|
||||||
int width = refine_image->nx;
|
|
||||||
int height = refine_image->ny;
|
|
||||||
int grid_x = int(width / best_grid.first);
|
|
||||||
int grid_y = int(height / best_grid.second);
|
|
||||||
for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
|
|
||||||
images.push_back(std::vector<clip_image_u8 *>());
|
|
||||||
for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
|
|
||||||
clip_image_u8 * patch = clip_image_u8_init();
|
|
||||||
patch->nx = grid_x;
|
|
||||||
patch->ny = grid_y;
|
|
||||||
patch->buf.resize(3 * patch->nx * patch->ny);
|
|
||||||
for (int y = patches_i; y < patches_i + grid_y; ++y) {
|
|
||||||
for (int x = patches_j; x < patches_j + grid_x; ++x) {
|
|
||||||
const int i = 3 * (y * refine_image->nx + x);
|
|
||||||
const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j));
|
|
||||||
patch->buf[j] = refine_image->buf[i];
|
|
||||||
patch->buf[j+1] = refine_image->buf[i+1];
|
|
||||||
patch->buf[j+2] = refine_image->buf[i+2];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
images[images.size()-1].push_back(patch);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return images;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img) {
|
|
||||||
std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img);
|
|
||||||
for (size_t i = 0; i < imgs.size(); ++i){
|
|
||||||
for (size_t j = 0; j < imgs[i].size(); ++j) {
|
|
||||||
LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
struct uhd_image_embed * results = new uhd_image_embed();
|
|
||||||
|
|
||||||
for (size_t i = 0; i < imgs.size(); ++i){
|
|
||||||
results->image_embeds.push_back(std::vector<llava_image_embed *>());
|
|
||||||
for (size_t j = 0; j < imgs[i].size(); ++j) {
|
|
||||||
float* image_embed = NULL;
|
|
||||||
int n_image_pos = 0;
|
|
||||||
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, imgs[i][j], &image_embed, &n_image_pos);
|
|
||||||
if (!image_embed_result) {
|
|
||||||
LOG_TEE("%s: coulnd't embed the image\n", __func__);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed));
|
|
||||||
result->embed = image_embed;
|
|
||||||
result->n_image_pos = n_image_pos;
|
|
||||||
results->image_embeds[i].push_back(result);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return results;
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
|
|
||||||
auto file = fopen(path, "rb");
|
|
||||||
if (file == NULL) {
|
|
||||||
LOG_TEE("%s: can't read file %s\n", __func__, path);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
fseek(file, 0, SEEK_END);
|
|
||||||
auto fileSize = ftell(file);
|
|
||||||
fseek(file, 0, SEEK_SET);
|
|
||||||
|
|
||||||
auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
|
|
||||||
if (buffer == NULL) {
|
|
||||||
LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
|
|
||||||
perror("Memory allocation error");
|
|
||||||
fclose(file);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
errno = 0;
|
|
||||||
size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer
|
|
||||||
if (ferror(file)) {
|
|
||||||
die_fmt("read error: %s", strerror(errno));
|
|
||||||
}
|
|
||||||
if (ret != (size_t) fileSize) {
|
|
||||||
die("unexpectedly reached end of file");
|
|
||||||
}
|
|
||||||
fclose(file); // Close the file
|
|
||||||
|
|
||||||
*bytesOut = buffer;
|
|
||||||
*sizeOut = fileSize;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Build one contiguous embedding for img: the overview image and, when the
// image was sliced, every grid tile, each framed by special tokens read from
// ./examples/minicpm-v2.5/slice_token_for_ollama.raw.
//
// Sequence written to the output, using the token file's entries 0..4:
//   [0] overview [1]
//   and, if there are tiles:  [2]  ( [0] tile [1] )... with [4] after the last
//   tile of each row  [3]
// NOTE(review): the meaning of the individual token entries is not visible
// here; only their order of use is taken from the original code.
//
// On success *image_embd_out receives a malloc'ed buffer of *n_img_pos_out
// rows of clip_n_mmproj_embd(ctx_clip) floats (release it with free).
//
// Fixes relative to the original:
//   - offsets into the float output were computed with a byte count, which
//     spaced the entries four floats-widths apart instead of contiguously
//     (llava_eval_image_embed reads rows at embed + i*n_embd)
//   - a failed llava_image_embed_make_with_bytes_uhd (NULL) was dereferenced
//   - each image copy used a hard-coded 96 positions while the write position
//     advanced by clip_n_patches(); both now use clip_n_patches()
//   - the token file size is validated before it is read from
//   - the token buffer and the intermediate embeds are released on every path
bool llava_image_embed_make_with_clip_img_ollama(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
    struct uhd_image_embed * embeds = llava_image_embed_make_with_bytes_uhd(ctx_clip, n_threads, img);

    // releases the per-slice embeddings and the container itself
    auto release_embeds = [&embeds]() {
        if (!embeds) {
            return;
        }
        for (auto & row : embeds->image_embeds) {
            for (llava_image_embed * e : row) {
                if (e) {
                    free(e->embed);
                    free(e);
                }
            }
        }
        delete embeds;
        embeds = NULL;
    };

    if (!embeds || embeds->image_embeds.empty() || embeds->image_embeds[0].empty() || !embeds->image_embeds[0][0]) {
        LOG_TEE("%s: failed to embeding image\n", __func__);
        release_embeds();
        return false;
    }
    const auto & image_embed_slices = embeds->image_embeds;

    std::string fname = "./examples/minicpm-v2.5/slice_token_for_ollama.raw";
    unsigned char* slice_token = NULL;
    long slice_token_bytes = 0;
    auto loaded = load_file_to_bytes(fname.c_str(), &slice_token, &slice_token_bytes);
    if (!loaded) {
        LOG_TEE("%s: failed to load %s\n", __func__, fname.c_str());
        release_embeds();
        return false;
    }

    const size_t n_embd    = clip_n_mmproj_embd(ctx_clip);
    const size_t token_len = n_embd*sizeof(float);   // bytes per position
    const size_t n_patches = clip_n_patches(ctx_clip);

    // entries 0..4 of the token file are read below
    if (slice_token_bytes < 0 || (size_t) slice_token_bytes < 5*token_len) {
        LOG_TEE("%s: %s is too small (%ld bytes)\n", __func__, fname.c_str(), slice_token_bytes);
        free(slice_token);
        release_embeds();
        return false;
    }

    std::vector<float> all_embd;
    int all_n_img_pos = 0;

    // append n_tokens rows of n_embd floats starting at src
    auto append_tokens = [&](const void * src, size_t n_tokens) {
        const size_t old_size = all_embd.size();
        all_embd.resize(old_size + n_tokens*n_embd);
        std::memcpy(all_embd.data() + old_size, src, n_tokens*token_len);
        all_n_img_pos += (int) n_tokens;
    };
    auto append_special = [&](size_t idx) {
        append_tokens(slice_token + idx*token_len, 1);
    };
    auto append_image = [&](const llava_image_embed * e) {
        append_special(0);
        append_tokens(e->embed, n_patches);
        append_special(1);
    };

    append_image(image_embed_slices[0][0]);
    if (image_embed_slices.size() > 1) {
        append_special(2);
        for (size_t i = 1; i < image_embed_slices.size(); ++i) {
            for (size_t j = 0; j < image_embed_slices[i].size(); ++j) {
                append_image(image_embed_slices[i][j]);
                if (j == image_embed_slices[i].size() - 1) {
                    append_special(4);
                }
            }
        }
        append_special(3);
    }

    free(slice_token);
    release_embeds();

    const size_t out_bytes = all_embd.size()*sizeof(float);
    float * all_image_embd = (float *)malloc(out_bytes);
    if (!all_image_embd) {
        LOG_TEE("%s: failed to allocate %zu bytes for the image embedding\n", __func__, out_bytes);
        return false;
    }
    std::memcpy(all_image_embd, all_embd.data(), out_bytes);

    *image_embd_out = all_image_embd;
    *n_img_pos_out = all_n_img_pos;
    return true;
}
|
|
||||||
|
|
||||||
struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
|
|
||||||
unsigned char* image_bytes;
|
|
||||||
long image_bytes_length;
|
|
||||||
auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
|
|
||||||
if (!loaded) {
|
|
||||||
LOG_TEE("%s: failed to load %s\n", __func__, image_path);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
clip_image_u8 * img = clip_image_u8_init();
|
|
||||||
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
|
|
||||||
clip_image_u8_free(img);
|
|
||||||
LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct uhd_image_embed * embeds = llava_image_embed_make_with_bytes_uhd(ctx_clip, n_threads, img);
|
|
||||||
|
|
||||||
clip_image_u8_free(img);
|
|
||||||
free(image_bytes);
|
|
||||||
return embeds;
|
|
||||||
}
|
|
||||||
|
|
||||||
void llava_image_embed_free_uhd(struct uhd_image_embed * embed) {
|
|
||||||
for (size_t i = 0; i < embed->image_embeds.size(); ++i){
|
|
||||||
for (size_t j = 0; j < embed->image_embeds[i].size(); ++j){
|
|
||||||
free(embed->image_embeds[i][j]->embed);
|
|
||||||
free(embed->image_embeds[i][j]);
|
|
||||||
}
|
|
||||||
embed->image_embeds[i] = std::vector<struct llava_image_embed *>();
|
|
||||||
}
|
|
||||||
embed->image_embeds = std::vector<std::vector<struct llava_image_embed *>>();
|
|
||||||
}
|
|
@ -1,54 +0,0 @@
|
|||||||
#ifndef LLAVA_H
#define LLAVA_H

#include "ggml.h"

// uhd_image_embed below stores its slices in std::vector
#include <vector>

// MINICPMV_API marks the public entry points: dllexport/dllimport on Windows
// (non-MinGW) shared builds, default visibility on other shared builds, and
// nothing for static builds.
#ifdef LLAMA_SHARED
#    if defined(_WIN32) && !defined(__MINGW32__)
#        ifdef LLAMA_BUILD
#            define MINICPMV_API __declspec(dllexport)
#        else
#            define MINICPMV_API __declspec(dllimport)
#        endif
#    else
#        define MINICPMV_API __attribute__ ((visibility ("default")))
#    endif
#else
#    define MINICPMV_API
#endif

struct clip_ctx;
struct clip_image_u8;
struct llava_image_embed;

/** embeddings of a sliced image, one inner vector per row of slices */
struct uhd_image_embed {
    std::vector<std::vector<struct llava_image_embed *>> image_embeds;
};

#ifdef __cplusplus
extern "C" {
#endif

/** one embedding: n_image_pos positions stored in embed */
struct llava_image_embed {
    float * embed;
    int n_image_pos;
};

/** sanity check for clip <-> llava embed size match */
MINICPMV_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip);

/** encode img into a newly allocated embedding buffer */
MINICPMV_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);

/** build the embeddings of all slices of an already decoded image */
MINICPMV_API struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img);

/** build a single embedding buffer holding all slices of img together with their separator tokens */
MINICPMV_API bool llava_image_embed_make_with_clip_img_ollama(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);

/** build the embeddings of all slices of an image from a path to an image filename */
MINICPMV_API struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);

/** free the embeddings made with llava_image_embed_make_*_uhd */
MINICPMV_API void llava_image_embed_free_uhd(struct uhd_image_embed * embed);

/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
MINICPMV_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);

#ifdef __cplusplus
}
#endif

#endif
|
|
@ -1,4 +0,0 @@
|
|||||||
-r ../../requirements/requirements-convert.txt
|
|
||||||
pillow~=10.2.0
|
|
||||||
torch~=2.1.1
|
|
||||||
torchvision==0.16.2
|
|
Loading…
Reference in New Issue
Block a user