mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 11:40:17 +00:00
fix code review
This commit is contained in:
parent
292a46906d
commit
be8b5b2f8d
6
Makefile
6
Makefile
@ -950,12 +950,12 @@ llama-llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/lla
|
|||||||
$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
|
$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp examples/llava/minicpmv_wrapper.h examples/llava/minicpmv_wrapper.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp examples/llava/minicpmv-wrapper.h examples/llava/minicpmv-wrapper.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
|
$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
|
||||||
$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
|
$(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp)
|
||||||
$(CXX) $(CXXFLAGS) -c examples/llava/minicpmv_wrapper.cpp -o $(call GET_OBJ_FILE, examples/llava/minicpmv_wrapper.cpp)
|
$(CXX) $(CXXFLAGS) -c examples/llava/minicpmv-wrapper.cpp -o $(call GET_OBJ_FILE, examples/llava/minicpmv-wrapper.cpp)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp examples/llava/minicpmv_wrapper.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) $(call GET_OBJ_FILE, examples/llava/minicpmv_wrapper.cpp) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp examples/llava/minicpmv-wrapper.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) $(call GET_OBJ_FILE, examples/llava/minicpmv-wrapper.cpp) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
llama-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
|
@ -44,7 +44,7 @@ install(TARGETS ${TARGET} RUNTIME)
|
|||||||
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
|
|
||||||
add_library(minicpmv_wrapper OBJECT
|
add_library(minicpmv-wrapper OBJECT
|
||||||
minicpmv_wrapper.cpp
|
minicpmv-wrapper.cpp
|
||||||
)
|
)
|
||||||
target_link_libraries(minicpmv_wrapper PRIVATE llava ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(minicpmv-wrapper PRIVATE llava ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
@ -77,7 +77,7 @@ static std::string format(const char * fmt, ...) {
|
|||||||
#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
|
#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
|
||||||
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
|
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
|
||||||
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
|
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
|
||||||
#define KEY_HAS_MiniCPMV_PROJ "clip.has_minicpmv_projector"
|
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
|
||||||
#define KEY_USE_GELU "clip.use_gelu"
|
#define KEY_USE_GELU "clip.use_gelu"
|
||||||
#define KEY_N_EMBD "clip.%s.embedding_length"
|
#define KEY_N_EMBD "clip.%s.embedding_length"
|
||||||
#define KEY_N_FF "clip.%s.feed_forward_length"
|
#define KEY_N_FF "clip.%s.feed_forward_length"
|
||||||
@ -124,8 +124,7 @@ static std::string format(const char * fmt, ...) {
|
|||||||
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
||||||
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
|
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
|
||||||
#define TN_IMAGE_NEWLINE "model.image_newline"
|
#define TN_IMAGE_NEWLINE "model.image_newline"
|
||||||
// MINICPMV
|
|
||||||
// #define TN_MINICPMV_POS_EMBD "resampler.pos_embed"
|
|
||||||
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
|
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
|
||||||
#define TN_MINICPMV_QUERY "resampler.query"
|
#define TN_MINICPMV_QUERY "resampler.query"
|
||||||
#define TN_MINICPMV_PROJ "resampler.proj.weight"
|
#define TN_MINICPMV_PROJ "resampler.proj.weight"
|
||||||
@ -502,7 +501,6 @@ struct clip_vision_model {
|
|||||||
struct ggml_tensor * mm_model_peg_0_b;
|
struct ggml_tensor * mm_model_peg_0_b;
|
||||||
|
|
||||||
// MINICPMV projection
|
// MINICPMV projection
|
||||||
// struct ggml_tensor * mm_model_pos_embed;
|
|
||||||
struct ggml_tensor * mm_model_pos_embed_k;
|
struct ggml_tensor * mm_model_pos_embed_k;
|
||||||
struct ggml_tensor * mm_model_query;
|
struct ggml_tensor * mm_model_query;
|
||||||
struct ggml_tensor * mm_model_proj;
|
struct ggml_tensor * mm_model_proj;
|
||||||
@ -554,7 +552,7 @@ struct clip_ctx {
|
|||||||
ggml_gallocr_t compute_alloc = NULL;
|
ggml_gallocr_t compute_alloc = NULL;
|
||||||
};
|
};
|
||||||
|
|
||||||
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct load_image_size * load_image_size, bool is_inf = false) {
|
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||||
return nullptr;
|
return nullptr;
|
||||||
@ -568,11 +566,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||||||
int image_size_height = image_size;
|
int image_size_height = image_size;
|
||||||
if (ctx->has_minicpmv_projector) {
|
if (ctx->has_minicpmv_projector) {
|
||||||
if(load_image_size==nullptr){
|
if(load_image_size==nullptr){
|
||||||
load_image_size= load_image_size_init();
|
load_image_size= clip_image_size_init();
|
||||||
}
|
}
|
||||||
LOG_TEE("%s : %d %d\n", __func__, load_image_size->image_size_width, load_image_size->image_size_height);
|
LOG_TEE("%s : %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
||||||
image_size_width = load_image_size->image_size_width;
|
image_size_width = load_image_size->width;
|
||||||
image_size_height = load_image_size->image_size_height;
|
image_size_height = load_image_size->height;
|
||||||
if (is_inf){
|
if (is_inf){
|
||||||
image_size_width = imgs->data->nx;
|
image_size_width = imgs->data->nx;
|
||||||
image_size_height = imgs->data->ny;
|
image_size_height = imgs->data->ny;
|
||||||
@ -926,7 +924,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||||||
peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
|
peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
|
||||||
embeddings = peg_0;
|
embeddings = peg_0;
|
||||||
}
|
}
|
||||||
|
|
||||||
else {
|
else {
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
@ -999,7 +996,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||||||
}
|
}
|
||||||
|
|
||||||
// read and create ggml_context containing the tensors and their data
|
// read and create ggml_context containing the tensors and their data
|
||||||
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1, struct load_image_size * load_image_size) {
|
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1, struct clip_image_size * load_image_size) {
|
||||||
struct ggml_context * meta = NULL;
|
struct ggml_context * meta = NULL;
|
||||||
|
|
||||||
struct gguf_init_params params = {
|
struct gguf_init_params params = {
|
||||||
@ -1468,10 +1465,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1, s
|
|||||||
return new_clip;
|
return new_clip;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct load_image_size * load_image_size_init() {
|
struct clip_image_size * clip_image_size_init() {
|
||||||
struct load_image_size * load_image_size = new struct load_image_size();
|
struct clip_image_size * load_image_size = new struct clip_image_size();
|
||||||
load_image_size->image_size_width = 448;
|
load_image_size->width = 448;
|
||||||
load_image_size->image_size_height = 448;
|
load_image_size->height = 448;
|
||||||
return load_image_size;
|
return load_image_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2069,7 +2066,7 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
|
|||||||
return pos_embed_2d;
|
return pos_embed_2d;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec, struct load_image_size * load_image_size) {
|
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec, struct clip_image_size * load_image_size) {
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||||
return false;
|
return false;
|
||||||
@ -2081,7 +2078,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
|
|||||||
return clip_image_batch_encode(ctx, n_threads, &imgs, vec, load_image_size);
|
return clip_image_batch_encode(ctx, n_threads, &imgs, vec, load_image_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec, struct load_image_size * load_image_size) {
|
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec, struct clip_image_size * load_image_size) {
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||||
return false;
|
return false;
|
||||||
@ -2103,7 +2100,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||||||
const auto & model = ctx->vision_model;
|
const auto & model = ctx->vision_model;
|
||||||
const auto & hparams = model.hparams;
|
const auto & hparams = model.hparams;
|
||||||
|
|
||||||
const int image_size = hparams.image_size;
|
const int image_size = hparams.image_size;
|
||||||
int image_size_width = image_size;
|
int image_size_width = image_size;
|
||||||
int image_size_height = image_size;
|
int image_size_height = image_size;
|
||||||
if (ctx->has_minicpmv_projector) {
|
if (ctx->has_minicpmv_projector) {
|
||||||
@ -2160,11 +2157,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||||||
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
|
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
|
||||||
struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
|
struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
|
||||||
if(load_image_size==nullptr){
|
if(load_image_size==nullptr){
|
||||||
load_image_size= load_image_size_init();
|
load_image_size= clip_image_size_init();
|
||||||
}
|
}
|
||||||
LOG_TEE("%s : %d %d\n", __func__, load_image_size->image_size_width, load_image_size->image_size_height);
|
LOG_TEE("%s : %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
||||||
int pos_w = load_image_size->image_size_width/patch_size;
|
int pos_w = load_image_size->width/patch_size;
|
||||||
int pos_h = load_image_size->image_size_height/patch_size;
|
int pos_h = load_image_size->height/patch_size;
|
||||||
int embed_dim = 4096;
|
int embed_dim = 4096;
|
||||||
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
|
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
|
||||||
|
|
||||||
|
@ -26,9 +26,9 @@ extern "C" {
|
|||||||
|
|
||||||
struct clip_ctx;
|
struct clip_ctx;
|
||||||
|
|
||||||
struct load_image_size {
|
struct clip_image_size {
|
||||||
int image_size_width;
|
int width;
|
||||||
int image_size_height;
|
int height;
|
||||||
};
|
};
|
||||||
struct clip_image_u8_batch {
|
struct clip_image_u8_batch {
|
||||||
struct clip_image_u8 * data;
|
struct clip_image_u8 * data;
|
||||||
@ -40,7 +40,7 @@ struct clip_image_f32_batch {
|
|||||||
size_t size;
|
size_t size;
|
||||||
};
|
};
|
||||||
|
|
||||||
CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity, struct load_image_size * load_image_size = nullptr);
|
CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity, struct clip_image_size * load_image_size);
|
||||||
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
|
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
|
||||||
|
|
||||||
CLIP_API void clip_free(struct clip_ctx * ctx);
|
CLIP_API void clip_free(struct clip_ctx * ctx);
|
||||||
@ -59,7 +59,7 @@ CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
|
|||||||
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
|
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
|
||||||
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
|
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
|
||||||
|
|
||||||
CLIP_API struct load_image_size * load_image_size_init();
|
CLIP_API struct clip_image_size * clip_image_size_init();
|
||||||
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
|
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
|
||||||
CLIP_API struct clip_image_f32 * clip_image_f32_init();
|
CLIP_API struct clip_image_f32 * clip_image_f32_init();
|
||||||
|
|
||||||
@ -80,8 +80,8 @@ CLIP_API void uhd_normalize_image_u8_to_f32(struct clip_ctx * ctx, const clip_im
|
|||||||
|
|
||||||
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
|
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
|
||||||
|
|
||||||
CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec, struct load_image_size * load_image_size = nullptr);
|
CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec, struct clip_image_size * load_image_size);
|
||||||
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec, struct load_image_size * load_image_size = nullptr);
|
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec, struct clip_image_size * load_image_size);
|
||||||
|
|
||||||
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
|
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
|
||||||
|
|
||||||
|
@ -413,7 +413,7 @@ void llava_image_embed_free(struct llava_image_embed * embed) {
|
|||||||
free(embed);
|
free(embed);
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool encode_image_with_clip_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos, struct load_image_size * load_image_size) {
|
static bool encode_image_with_clip_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos, struct clip_image_size * load_image_size) {
|
||||||
// std::vector<clip_image_f32*> img_res_v;
|
// std::vector<clip_image_f32*> img_res_v;
|
||||||
// format VectN x H x W x RGB (N x 448 x 448 x 3)
|
// format VectN x H x W x RGB (N x 448 x 448 x 3)
|
||||||
clip_image_f32 * img_res_v = clip_image_f32_init();
|
clip_image_f32 * img_res_v = clip_image_f32_init();
|
||||||
@ -686,10 +686,10 @@ struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx *
|
|||||||
float* image_embed = NULL;
|
float* image_embed = NULL;
|
||||||
int n_image_pos = 0;
|
int n_image_pos = 0;
|
||||||
int patch_size=14;
|
int patch_size=14;
|
||||||
struct load_image_size * load_image_size = load_image_size_init();
|
struct clip_image_size * load_image_size = clip_image_size_init();
|
||||||
load_image_size->image_size_width = imgs[i][j]->nx;
|
load_image_size->width = imgs[i][j]->nx;
|
||||||
load_image_size->image_size_height = imgs[i][j]->ny;
|
load_image_size->height = imgs[i][j]->ny;
|
||||||
LOG_TEE("%s : %d %d\n", __func__, load_image_size->image_size_width, load_image_size->image_size_height);
|
LOG_TEE("%s : %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
||||||
bool image_embed_result = llava_image_embed_make_with_clip_img_uhd(ctx_clip, n_threads, only_v2_5_reshape_by_patch(imgs[i][j], patch_size), &image_embed, &n_image_pos, load_image_size);
|
bool image_embed_result = llava_image_embed_make_with_clip_img_uhd(ctx_clip, n_threads, only_v2_5_reshape_by_patch(imgs[i][j], patch_size), &image_embed, &n_image_pos, load_image_size);
|
||||||
if (!image_embed_result) {
|
if (!image_embed_result) {
|
||||||
LOG_TEE("%s: coulnd't embed the image\n", __func__);
|
LOG_TEE("%s: coulnd't embed the image\n", __func__);
|
||||||
@ -705,7 +705,7 @@ struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx *
|
|||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool llava_image_embed_make_with_clip_img_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, struct load_image_size * load_image_size) {
|
bool llava_image_embed_make_with_clip_img_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, struct clip_image_size * load_image_size) {
|
||||||
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
|
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
|
||||||
if (!image_embd) {
|
if (!image_embd) {
|
||||||
LOG_TEE("Unable to allocate memory for image embeddings\n");
|
LOG_TEE("Unable to allocate memory for image embeddings\n");
|
||||||
@ -724,50 +724,6 @@ bool llava_image_embed_make_with_clip_img_uhd(clip_ctx * ctx_clip, int n_threads
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool llava_image_embed_make_with_clip_img_ollama(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
|
|
||||||
auto embeds = llava_image_embed_make_with_bytes_uhd(ctx_clip, n_threads, img);
|
|
||||||
auto image_embed_slices = embeds->image_embeds;
|
|
||||||
if (!image_embed_slices[0][0]){
|
|
||||||
LOG_TEE("%s: failed to embeding image\n", __func__);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
std::string fname = "./examples/minicpm-v2.5/slice_token_for_ollama.raw";
|
|
||||||
unsigned char* slice_token;
|
|
||||||
long image_bytes_length;
|
|
||||||
auto loaded = load_file_to_bytes(fname.c_str(), &slice_token, &image_bytes_length);
|
|
||||||
if (!loaded) {
|
|
||||||
LOG_TEE("%s: failed to load %s\n", __func__, fname.c_str());
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
float * all_image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*61);
|
|
||||||
int all_n_img_pos=0;
|
|
||||||
int token_len = clip_n_mmproj_embd(ctx_clip)*sizeof(float);
|
|
||||||
|
|
||||||
std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token, token_len);
|
|
||||||
std::memcpy(all_image_embd+token_len*all_n_img_pos, image_embed_slices[0][0]->embed, 96*token_len);
|
|
||||||
all_n_img_pos+=clip_n_patches(ctx_clip);
|
|
||||||
std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len, token_len);
|
|
||||||
if (image_embed_slices.size() > 1) {
|
|
||||||
std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len*2, token_len);
|
|
||||||
for (size_t i = 1; i < image_embed_slices.size(); ++i) {
|
|
||||||
for (size_t j = 0; j < image_embed_slices[i].size(); ++j) {
|
|
||||||
std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token, token_len);
|
|
||||||
std::memcpy(all_image_embd+token_len*all_n_img_pos, image_embed_slices[i][j]->embed, 96*token_len);
|
|
||||||
all_n_img_pos+=clip_n_patches(ctx_clip);
|
|
||||||
std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len, token_len);
|
|
||||||
if (j == image_embed_slices[i].size() - 1) {
|
|
||||||
std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len*4, token_len);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
std::memcpy(all_image_embd+token_len*all_n_img_pos++, slice_token+token_len*3, token_len);
|
|
||||||
}
|
|
||||||
*image_embd_out = all_image_embd;
|
|
||||||
*n_img_pos_out = all_n_img_pos;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
|
struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
|
||||||
unsigned char* image_bytes;
|
unsigned char* image_bytes;
|
||||||
long image_bytes_length;
|
long image_bytes_length;
|
||||||
|
@ -45,8 +45,7 @@ LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
|
|||||||
/** build an image embed from image file bytes */
|
/** build an image embed from image file bytes */
|
||||||
LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img);
|
LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img);
|
||||||
/** build an image embed from a path to an image filename */
|
/** build an image embed from a path to an image filename */
|
||||||
LLAVA_API bool llava_image_embed_make_with_clip_img_uhd(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, struct load_image_size * load_image_size = nullptr);
|
LLAVA_API bool llava_image_embed_make_with_clip_img_uhd(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, struct clip_image_size * load_image_size);
|
||||||
LLAVA_API bool llava_image_embed_make_with_clip_img_ollama(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
|
|
||||||
LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
|
LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
|
||||||
LLAVA_API void llava_image_embed_free_uhd(struct uhd_image_embed * embed);
|
LLAVA_API void llava_image_embed_free_uhd(struct uhd_image_embed * embed);
|
||||||
|
|
||||||
|
@ -3,7 +3,7 @@
|
|||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "clip.h"
|
#include "clip.h"
|
||||||
#include "llava.h"
|
#include "llava.h"
|
||||||
#include "minicpmv_wrapper.h"
|
#include "minicpmv-wrapper.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "clip.h"
|
#include "clip.h"
|
||||||
#include "llava.h"
|
#include "llava.h"
|
||||||
#include "minicpmv_wrapper.h"
|
#include "minicpmv-wrapper.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
@ -58,7 +58,7 @@ struct clip_ctx * clip_init_context(gpt_params * params) {
|
|||||||
if (prompt.empty()) {
|
if (prompt.empty()) {
|
||||||
prompt = "describe the image in detail.";
|
prompt = "describe the image in detail.";
|
||||||
}
|
}
|
||||||
struct load_image_size * load_image_size = load_image_size_init();
|
struct clip_image_size * load_image_size = clip_image_size_init();
|
||||||
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1, load_image_size);
|
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1, load_image_size);
|
||||||
return ctx_clip;
|
return ctx_clip;
|
||||||
}
|
}
|
||||||
@ -99,8 +99,7 @@ bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
|
|||||||
bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
|
bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
|
||||||
std::string str2 = str;
|
std::string str2 = str;
|
||||||
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
|
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
|
||||||
eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
|
return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void process_image(struct minicpmv_context * ctx_llava, std::vector<std::vector<struct llava_image_embed *>> image_embed_slices, gpt_params * params, int &n_past) {
|
void process_image(struct minicpmv_context * ctx_llava, std::vector<std::vector<struct llava_image_embed *>> image_embed_slices, gpt_params * params, int &n_past) {
|
Loading…
Reference in New Issue
Block a user