fix issues for merging

This commit is contained in:
caitianchi 2024-07-17 15:04:25 +08:00
parent 3e6348b8dc
commit c5b68515f0
8 changed files with 48 additions and 24 deletions

1
.gitignore vendored
View File

@ -117,7 +117,6 @@ poetry.toml
/tests/test-tokenizer-0 /tests/test-tokenizer-0
/tests/test-tokenizer-1-bpe /tests/test-tokenizer-1-bpe
/tests/test-tokenizer-1-spm /tests/test-tokenizer-1-spm
/openbmb
# Scripts # Scripts
!/scripts/install-oneapi.bat !/scripts/install-oneapi.bat

Binary file not shown.

Before

Width:  |  Height:  |  Size: 304 KiB

View File

@ -554,7 +554,7 @@ struct clip_ctx {
ggml_gallocr_t compute_alloc = NULL; ggml_gallocr_t compute_alloc = NULL;
}; };
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, std::pair<int, int> load_image_size = {448, 448}, bool is_inf = false) { static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct load_image_size * load_image_size, bool is_inf = false) {
if (!ctx->has_vision_encoder) { if (!ctx->has_vision_encoder) {
LOG_TEE("This gguf file seems to have no vision encoder\n"); LOG_TEE("This gguf file seems to have no vision encoder\n");
return nullptr; return nullptr;
@ -567,8 +567,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
int image_size_width = image_size; int image_size_width = image_size;
int image_size_height = image_size; int image_size_height = image_size;
if (ctx->has_minicpmv_projector) { if (ctx->has_minicpmv_projector) {
image_size_width = load_image_size.first; if(load_image_size==nullptr){
image_size_height = load_image_size.second; load_image_size= load_image_size_init();
}
LOG_TEE("%s : %d %d\n", __func__, load_image_size->image_size_width, load_image_size->image_size_height);
image_size_width = load_image_size->image_size_width;
image_size_height = load_image_size->image_size_height;
if (is_inf){ if (is_inf){
image_size_width = imgs->data->nx; image_size_width = imgs->data->nx;
image_size_height = imgs->data->ny; image_size_height = imgs->data->ny;
@ -995,7 +999,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
} }
// read and create ggml_context containing the tensors and their data // read and create ggml_context containing the tensors and their data
struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1, std::pair<int, int> load_image_size) { struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1, struct load_image_size * load_image_size) {
struct ggml_context * meta = NULL; struct ggml_context * meta = NULL;
struct gguf_init_params params = { struct gguf_init_params params = {
@ -1464,6 +1468,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1, s
return new_clip; return new_clip;
} }
struct load_image_size * load_image_size_init() {
struct load_image_size * load_image_size = new struct load_image_size();
load_image_size->image_size_width = 448;
load_image_size->image_size_height = 448;
return load_image_size;
}
struct clip_image_u8 * clip_image_u8_init() { struct clip_image_u8 * clip_image_u8_init() {
return new clip_image_u8(); return new clip_image_u8();
} }
@ -2058,7 +2069,7 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
return pos_embed_2d; return pos_embed_2d;
} }
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec, std::pair<int, int> load_image_size) { bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec, struct load_image_size * load_image_size) {
if (!ctx->has_vision_encoder) { if (!ctx->has_vision_encoder) {
LOG_TEE("This gguf file seems to have no vision encoder\n"); LOG_TEE("This gguf file seems to have no vision encoder\n");
return false; return false;
@ -2070,7 +2081,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
return clip_image_batch_encode(ctx, n_threads, &imgs, vec, load_image_size); return clip_image_batch_encode(ctx, n_threads, &imgs, vec, load_image_size);
} }
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec, std::pair<int, int> load_image_size) { bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec, struct load_image_size * load_image_size) {
if (!ctx->has_vision_encoder) { if (!ctx->has_vision_encoder) {
LOG_TEE("This gguf file seems to have no vision encoder\n"); LOG_TEE("This gguf file seems to have no vision encoder\n");
return false; return false;
@ -2148,8 +2159,12 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main // -> https://huggingface.co/Qwen/Qwen-VL/tree/main
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23 // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed"); struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
int pos_w = load_image_size.first/patch_size; if(load_image_size==nullptr){
int pos_h = load_image_size.second/patch_size; load_image_size= load_image_size_init();
}
LOG_TEE("%s : %d %d\n", __func__, load_image_size->image_size_width, load_image_size->image_size_height);
int pos_w = load_image_size->image_size_width/patch_size;
int pos_h = load_image_size->image_size_height/patch_size;
int embed_dim = 4096; int embed_dim = 4096;
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));

View File

@ -3,7 +3,6 @@
#include <stddef.h> #include <stddef.h>
#include <stdint.h> #include <stdint.h>
#include <utility>
#ifdef LLAMA_SHARED #ifdef LLAMA_SHARED
# if defined(_WIN32) && !defined(__MINGW32__) # if defined(_WIN32) && !defined(__MINGW32__)
@ -27,6 +26,10 @@ extern "C" {
struct clip_ctx; struct clip_ctx;
struct load_image_size {
int image_size_width;
int image_size_height;
};
struct clip_image_u8_batch { struct clip_image_u8_batch {
struct clip_image_u8 * data; struct clip_image_u8 * data;
size_t size; size_t size;
@ -37,7 +40,7 @@ struct clip_image_f32_batch {
size_t size; size_t size;
}; };
CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity, std::pair<int, int> load_image_size = {448, 448}); CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity, struct load_image_size * load_image_size = nullptr);
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity); CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
CLIP_API void clip_free(struct clip_ctx * ctx); CLIP_API void clip_free(struct clip_ctx * ctx);
@ -56,6 +59,7 @@ CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
CLIP_API int clip_n_patches (const struct clip_ctx * ctx); CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
CLIP_API struct load_image_size * load_image_size_init();
CLIP_API struct clip_image_u8 * clip_image_u8_init (); CLIP_API struct clip_image_u8 * clip_image_u8_init ();
CLIP_API struct clip_image_f32 * clip_image_f32_init(); CLIP_API struct clip_image_f32 * clip_image_f32_init();
@ -76,8 +80,8 @@ CLIP_API void uhd_normalize_image_u8_to_f32(struct clip_ctx * ctx, const clip_im
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx); CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec, std::pair<int, int> load_image_size = {448, 448}); CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec, struct load_image_size * load_image_size = nullptr);
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec, std::pair<int, int> load_image_size = {448, 448}); CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec, struct load_image_size * load_image_size = nullptr);
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype); CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);

View File

@ -31,6 +31,9 @@ struct clip_image_grid_shape {
int second; int second;
}; };
struct uhd_image_embed {
std::vector<std::vector<struct llava_image_embed *>> image_embeds;
};
/** /**
* Selects the best resolution from a list of possible resolutions based on the original size. * Selects the best resolution from a list of possible resolutions based on the original size.
* *
@ -410,7 +413,7 @@ void llava_image_embed_free(struct llava_image_embed * embed) {
free(embed); free(embed);
} }
static bool encode_image_with_clip_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos, std::pair<int, int> load_image_size) { static bool encode_image_with_clip_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos, struct load_image_size * load_image_size) {
// std::vector<clip_image_f32*> img_res_v; // std::vector<clip_image_f32*> img_res_v;
// format VectN x H x W x RGB (N x 448 x 448 x 3) // format VectN x H x W x RGB (N x 448 x 448 x 3)
clip_image_f32 * img_res_v = clip_image_f32_init(); clip_image_f32 * img_res_v = clip_image_f32_init();
@ -683,9 +686,10 @@ struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx *
float* image_embed = NULL; float* image_embed = NULL;
int n_image_pos = 0; int n_image_pos = 0;
int patch_size=14; int patch_size=14;
std::pair<int, int> load_image_size; struct load_image_size * load_image_size = load_image_size_init();
load_image_size.first = imgs[i][j]->nx; load_image_size->image_size_width = imgs[i][j]->nx;
load_image_size.second = imgs[i][j]->ny; load_image_size->image_size_height = imgs[i][j]->ny;
LOG_TEE("%s : %d %d\n", __func__, load_image_size->image_size_width, load_image_size->image_size_height);
bool image_embed_result = llava_image_embed_make_with_clip_img_uhd(ctx_clip, n_threads, only_v2_5_reshape_by_patch(imgs[i][j], patch_size), &image_embed, &n_image_pos, load_image_size); bool image_embed_result = llava_image_embed_make_with_clip_img_uhd(ctx_clip, n_threads, only_v2_5_reshape_by_patch(imgs[i][j], patch_size), &image_embed, &n_image_pos, load_image_size);
if (!image_embed_result) { if (!image_embed_result) {
LOG_TEE("%s: coulnd't embed the image\n", __func__); LOG_TEE("%s: coulnd't embed the image\n", __func__);
@ -701,7 +705,7 @@ struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx *
return results; return results;
} }
bool llava_image_embed_make_with_clip_img_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, std::pair<int, int> load_image_size) { bool llava_image_embed_make_with_clip_img_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, struct load_image_size * load_image_size) {
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
if (!image_embd) { if (!image_embd) {
LOG_TEE("Unable to allocate memory for image embeddings\n"); LOG_TEE("Unable to allocate memory for image embeddings\n");

View File

@ -18,15 +18,13 @@
#endif #endif
struct clip_ctx; struct clip_ctx;
struct uhd_image_embed;
struct uhd_image_embed {
std::vector<std::vector<struct llava_image_embed *>> image_embeds;
};
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
struct uhd_image_embed;
struct llava_image_embed { struct llava_image_embed {
float * embed; float * embed;
int n_image_pos; int n_image_pos;
@ -47,7 +45,7 @@ LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
/** build an image embed from image file bytes */ /** build an image embed from image file bytes */
LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img); LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img);
/** build an image embed from a path to an image filename */ /** build an image embed from a path to an image filename */
LLAVA_API bool llava_image_embed_make_with_clip_img_uhd(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, std::pair<int, int> load_image_size = {448, 448}); LLAVA_API bool llava_image_embed_make_with_clip_img_uhd(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out, struct load_image_size * load_image_size = nullptr);
LLAVA_API bool llava_image_embed_make_with_clip_img_ollama(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out); LLAVA_API bool llava_image_embed_make_with_clip_img_ollama(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path); LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
LLAVA_API void llava_image_embed_free_uhd(struct uhd_image_embed * embed); LLAVA_API void llava_image_embed_free_uhd(struct uhd_image_embed * embed);

View File

@ -10,6 +10,10 @@
#include <cstdlib> #include <cstdlib>
#include <vector> #include <vector>
struct uhd_image_embed {
std::vector<std::vector<struct llava_image_embed *>> image_embeds;
};
static void show_additional_info(int /*argc*/, char ** argv) { static void show_additional_info(int /*argc*/, char ** argv) {
LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n"); LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");

View File

@ -58,7 +58,7 @@ struct clip_ctx * clip_init_context(gpt_params * params) {
if (prompt.empty()) { if (prompt.empty()) {
prompt = "describe the image in detail."; prompt = "describe the image in detail.";
} }
std::pair<int, int> load_image_size = std::make_pair(448, 448); struct load_image_size * load_image_size = load_image_size_init();
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1, load_image_size); auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1, load_image_size);
return ctx_clip; return ctx_clip;
} }