feat: support internvl

qianlangyu 2024-09-10 14:39:00 +08:00
parent bfe76d4a17
commit 1e8646b3e8
17 changed files with 3122 additions and 0 deletions


@@ -0,0 +1,38 @@
add_library(internvl OBJECT
internvl.cpp
internvl.h
clip.cpp
clip.h
)
target_link_libraries(internvl PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(internvl PUBLIC .)
target_include_directories(internvl PUBLIC ../..)
target_include_directories(internvl PUBLIC ../../common)
target_compile_features(internvl PRIVATE cxx_std_11)
add_library(internvl_static STATIC $<TARGET_OBJECTS:internvl>)
if (BUILD_SHARED_LIBS)
set_target_properties(internvl PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(internvl PRIVATE INTERNVL_SHARED INTERNVL_BUILD)
add_library(internvl_shared SHARED $<TARGET_OBJECTS:internvl>)
target_link_libraries(internvl_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS internvl_shared LIBRARY)
endif()
if (NOT MSVC)
target_compile_options(internvl PRIVATE -Wno-cast-qual) # stb_image.h
endif()
if(TARGET BUILD_INFO)
add_dependencies(internvl BUILD_INFO)
endif()
set(TARGET llama-internvl-cli)
add_executable(${TARGET} internvl-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-internvl-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common internvl ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

172 examples/internvl/README.md Normal file

@@ -0,0 +1,172 @@
# InternVL
Currently this implementation supports [Mini-InternVL-Chat-2B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-2B-V1-5).
## Usage
Build with CMake, or run `make llama-internvl-cli` to build it.
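A typical CMake build might look like the following (a sketch of the standard llama.cpp flow, run from the repository root; adjust the generator and flags for your toolchain):
```sh
cmake -B build
cmake --build build --config Release -j --target llama-internvl-cli
```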
After building, run `./llama-internvl-cli` to see the usage. For example:
```sh
./llama-internvl-cli -m InternVL-gguf/internlm2-1.8B-chat-q4_k.gguf --mmproj InternVL-gguf/InternViT-300M-448px-f16.gguf --image path/to/an/image.jpg -p "<image>\nPlease describe the image shortly."
```
## Model conversion
1. Clone `Mini-InternVL-Chat-2B-V1-5` locally:
```sh
git clone https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-2B-V1-5
```
2. Copy `config.json` from [internlm2-chat-1_8b](https://huggingface.co/internlm/internlm2-chat-1_8b):
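One simple way to obtain that file is to clone the repository as well (a full clone is shown for simplicity; only `config.json` is needed):
```sh
git clone https://huggingface.co/internlm/internlm2-chat-1_8b
```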
3. Use `split_language_tensors.py` to extract the language model tensors:
```sh
mkdir adjusted-internlm-chat
python split_language_tensors.py -m path/to/Mini-InternVL-Chat-2B-V1-5/model.safetensors -o path/to/adjusted-internlm-chat/model.safetensors
```
4. Prepare the essentials for converting the language model:
```sh
cp path/to/Mini-InternVL-Chat-2B-V1-5/*.json path/to/adjusted-internlm-chat/
cp path/to/Mini-InternVL-Chat-2B-V1-5/tokenizer.model path/to/adjusted-internlm-chat/
cp path/to/internlm2-chat-1_8b/config.json path/to/adjusted-internlm-chat/
```
5. Use `convert_hf_to_gguf.py` to convert the language model to GGUF:
```sh
python convert_hf_to_gguf.py path/to/adjusted-internlm-chat/
```
6. Use `vision_model_to_gguf.py` to convert the image encoder to GGUF:
```sh
python vision_model_to_gguf.py path/to/Mini-InternVL-Chat-2B-V1-5/model.safetensors
```
7. Collect and rename the models:
```sh
mkdir InternVL-gguf
mv Mini-InternVL-Chat-2B-V1-5/model.safetensors-f16.gguf InternVL-gguf/InternViT-300M-448px-f16.gguf
mv adjusted-internlm-chat/adjusted-internlm-1.9B-chat-F16.gguf InternVL-gguf/internlm2-1.8B-chat-F16.gguf
```
8. Use `llama-quantize` to quantize the language model from `fp16` to `q4_k`:
```sh
./llama-quantize path/to/InternVL-gguf/internlm2-1.8B-chat-F16.gguf path/to/InternVL-gguf/internlm2-1.8B-chat-q4_k.gguf q4_k_s
```
## Some results on Android with a `Snapdragon 888+` chip
### case 1
**input**
```sh
/data/local/tmp/llama-internvl-cli \
-m /data/local/tmp/internlm2-1.8B-chat-q4_k.gguf \
--mmproj /data/local/tmp/InternViT-300M-448px-f16.gguf \
-t 4 \
--image /data/local/tmp/image1.jpg \
-p "<image>\nPlease describe the image shortly." \
-b 4096 -c 4096
```
**output**
```sh
encode_image_with_clip: image embedding created: 1792 tokens
encode_image_with_clip: image encoded in 164683.39 ms by CLIP ( 91.90 ms per image patch)
The image shows a young red panda peeking over the top of a wooden platform or platform-like structure, with its head sticking out over the edge. The panda has a striking red coat with white patches around its eyes and ears. Its fur looks fluffy and it has a black nose and mouth. The background is green and blurry, suggesting it might be an outdoor setting, possibly a zoo or a sanctuary. The wood on the platform looks worn and worn, indicating it might be well used.
llama_print_timings: load time = 316889.60 ms
llama_print_timings: sample time = 7.27 ms / 103 runs ( 0.07 ms per token, 14173.66 tokens per second)
llama_print_timings: prompt eval time = 151858.76 ms / 1831 tokens ( 82.94 ms per token, 12.06 tokens per second)
llama_print_timings: eval time = 19437.72 ms / 102 runs ( 190.57 ms per token, 5.25 tokens per second)
llama_print_timings: total time = 336547.70 ms / 1933 tokens
```
### case 2
**input**
```sh
/data/local/tmp/llama-internvl-cli \
-m /data/local/tmp/internlm2-1.8B-chat-q4_k.gguf \
--mmproj /data/local/tmp/InternViT-300M-448px-f16.gguf \
-t 4 \
--image /data/local/tmp/demo.jpg \
-p "<image>\nWho is the author of this book? \nAnswer the question using a single word or phrase."
```
**output**
```sh
encode_image_with_clip: image embedding created: 768 tokens
encode_image_with_clip: image encoded in 87791.64 ms by CLIP ( 114.31 ms per image patch)
Susan Wise Bauer
llama_print_timings: load time = 144433.03 ms
llama_print_timings: sample time = 0.51 ms / 6 runs ( 0.08 ms per token, 11834.32 tokens per second)
llama_print_timings: prompt eval time = 55674.58 ms / 820 tokens ( 67.90 ms per token, 14.73 tokens per second)
llama_print_timings: eval time = 581.98 ms / 5 runs ( 116.40 ms per token, 8.59 tokens per second)
llama_print_timings: total time = 145118.73 ms / 825 tokens
```
## Running on an Nvidia RTX 4090
### case 1
**input**
```sh
bin/llama-internvl-cli \
-m path/to/internlm2-1.8B-chat-q4_k.gguf \
--mmproj path/to/InternViT-300M-448px-f16.gguf \
-t 4 --image path/to/image1.jpg \
-p "<image>\nPlease describe the image shortly." \
--gpu-layers 1000 -b 4096 -c 4096
```
**output**
```sh
encode_image_with_clip: image embedding created: 1792 tokens
encode_image_with_clip: image encoded in 278.86 ms by CLIP ( 0.16 ms per image patch)
The image depicts a red panda, a small, wild bear found primarily in the mountains of central and south-eastern China, and the surrounding areas of India and Nepal. This species is distinguished by its distinctive red fur with a white face and legs, which is a unique feature that helps them blend in with their natural habitat of mountainous regions. The red panda is known for its thick fur, which is typically a blend of red, black, and white fur, with a thick tail and large ears, which aid in thermoregulation.
In the image, the red panda is leaning over a wooden platform, which appears to be part of a man-made enclosure, likely within a zoo or wildlife park. The platform is made of wood and seems to be constructed to provide the animals with a place to rest or interact with visitors. The background features trees with green foliage, indicating that the setting is an outdoor environment with ample vegetation, which is typical for red panda habitats.
The red pandas front paws are resting on the platform, while its head is slightly tilted, giving an impression that it is engaged with something in front of it, possibly a camera, a person, or an object placed on the platform. Its eyes are large and dark, which are characteristic of the species, and it has a slightly wrinkled face, typical of many bear species, which helps them stay warm in cold temperatures. The expression on the pandas face appears curious or attentive, as it looks directly at the camera or observer.
In summary, the image showcases a red panda in an outdoor setting with a wooden platform, surrounded by green trees. The animal appears to be relaxed and engaged, likely interacting with the observer or something placed on the platform. The setting suggests that the panda is in a zoo or a wildlife sanctuary, where it is cared for and protected from the wild.
llama_print_timings: load time = 723.77 ms
llama_print_timings: sample time = 14.28 ms / 392 runs ( 0.04 ms per token, 27443.29 tokens per second)
llama_print_timings: prompt eval time = 107.43 ms / 1831 tokens ( 0.06 ms per token, 17043.81 tokens per second)
llama_print_timings: eval time = 1184.80 ms / 391 runs ( 3.03 ms per token, 330.01 tokens per second)
llama_print_timings: total time = 1942.12 ms / 2222 tokens
```
### case 2
**input**
```sh
/data/local/tmp/llama-internvl-cli \
-m /data/local/tmp/internlm2-1.8B-chat-q4_k.gguf \
--mmproj /data/local/tmp/InternViT-300M-448px-f16.gguf \
-t 4 \
--image /data/local/tmp/demo.jpg \
-p "<image>\nWho is the author of this book? \nAnswer the question using a single word or phrase." \
--gpu-layers 1000
```
**output**
```sh
encode_image_with_clip: image embedding created: 768 tokens
encode_image_with_clip: image encoded in 138.85 ms by CLIP ( 0.18 ms per image patch)
Susan Wise Bauer
llama_print_timings: load time = 430.77 ms
llama_print_timings: sample time = 0.21 ms / 6 runs ( 0.03 ms per token, 28571.43 tokens per second)
llama_print_timings: prompt eval time = 70.31 ms / 820 tokens ( 0.09 ms per token, 11661.97 tokens per second)
llama_print_timings: eval time = 15.84 ms / 5 runs ( 3.17 ms per token, 315.68 tokens per second)
llama_print_timings: total time = 446.85 ms / 825 tokens
```


@@ -0,0 +1,57 @@
#!/bin/bash
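# Push the test image and the llama-internvl-cli binary to an Android device over adb, run inference there, and pull the log back into ./output.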
model_dir="/home/qianlangyu/model/InternVL-gguf"
projector_name="InternViT-300M-448px-f16.gguf"
# projector_name="InternViT-300M-448px-q4_k.gguf"
# llama_name="internlm2-1.8B-chat-F16.gguf"
llama_name="internlm2-1.8B-chat-q4_k.gguf"
img_dir="/home/qianlangyu/resource/imgs"
img_name="image1.jpg"
prompt="<image>\nPlease describe the image shortly."
# img_name="cat.jpeg"
# prompt="<image>\nWhat is in the image?"
# img_name="demo.jpg"
# prompt="<image>\nWho is the author of this book? \nAnswer the question using a single word or phrase."
program_dir="build/bin"
binName="llama-internvl-cli"
n_threads=4
deviceDir="/data/local/tmp"
saveDir="output"
if [ ! -d ${saveDir} ]; then
mkdir ${saveDir}
fi
function android_run() {
# # copy resource into device
# adb push ${model_dir}/${projector_name} ${deviceDir}/${projector_name}
# adb push ${model_dir}/${llama_name} ${deviceDir}/${llama_name}
adb push ${img_dir}/${img_name} ${deviceDir}/${img_name}
# copy program into device
adb push ${program_dir}/${binName} ${deviceDir}/${binName}
adb shell "chmod 0777 ${deviceDir}/${binName}"
# run
adb shell "echo cd ${deviceDir} LD_LIBRARY_PATH=/data/local/tmp ${deviceDir}/${binName} \
-m ${deviceDir}/${llama_name} \
--mmproj ${deviceDir}/${projector_name} \
-t ${n_threads} \
--image ${deviceDir}/${img_name} \
-p \"${prompt}\" \
> ${deviceDir}/${modelName}_${projector_name}_${llama_name}_${n_threads}_${img_name}_1.txt"
adb shell "cd ${deviceDir}; pwd; LD_LIBRARY_PATH=/data/local/tmp ${deviceDir}/${binName} \
-m ${deviceDir}/${llama_name} \
--mmproj ${deviceDir}/${projector_name} \
-t ${n_threads} \
--image ${deviceDir}/${img_name} \
-p \"${prompt}\" \
>> ${deviceDir}/${modelName}_${projector_name}_${llama_name}_${n_threads}_${img_name}_1.txt 2>&1"
adb pull ${deviceDir}/${modelName}_${projector_name}_${llama_name}_${n_threads}_${img_name}_1.txt ${saveDir}
}
android_run
echo "android_run is Done!"


@@ -0,0 +1,8 @@
#!/bin/bash
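# Cross-compile llama.cpp (including llama-internvl-cli) for Android arm64-v8a with the NDK CMake toolchain; extra CMake options can be passed as $1.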
cmake ../../../../ \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DCMAKE_BUILD_TYPE=Release \
-DANDROID_ABI="arm64-v8a" \
-DANDROID_PLATFORM=android-23 $1
make -j4

1654 examples/internvl/clip.cpp Normal file

File diff suppressed because it is too large

96 examples/internvl/clip.h Normal file

@@ -0,0 +1,96 @@
#ifndef CLIP_H
#define CLIP_H
#include <stddef.h>
#include <stdint.h>
#ifdef LLAMA_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef LLAMA_BUILD
# define CLIP_API __declspec(dllexport)
# else
# define CLIP_API __declspec(dllimport)
# endif
# else
# define CLIP_API __attribute__ ((visibility ("default")))
# endif
#else
# define CLIP_API
#endif
struct clip_ctx;
#ifdef __cplusplus
extern "C" {
#endif
struct clip_vision_hparams {
int32_t image_size;
int32_t patch_size;
int32_t hidden_size;
int32_t n_intermediate;
int32_t n_head;
int32_t n_layer;
float scale_factor;
int32_t out_dim;
int32_t batch_size = 0;
float eps;
char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default)
int32_t image_grid_pinpoints[72];
// int32_t image_crop_resolution;
};
CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
CLIP_API void clip_free(struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);
// TODO: should be enum, not string
CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
struct clip_image_u8_batch {
struct clip_image_u8 * data;
size_t size;
};
struct clip_image_f32_batch {
struct clip_image_f32 * data;
size_t size;
};
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
CLIP_API struct clip_image_f32 * clip_image_f32_init();
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch & batch);
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch & batch);
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
CLIP_API bool clip_image_preprocess (struct clip_ctx * ctx, const struct clip_image_u8 * img, clip_image_f32_batch & res_imgs);
CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
// CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
#ifdef __cplusplus
}
#endif
#endif // CLIP_H


@@ -0,0 +1,358 @@
#include "ggml.h"
#include "common.h"
#include "clip.h"
#include "internvl.h"
#include "llama.h"
#include "base64.hpp"
#include <cstdio>
#include <cstdlib>
#include <vector>
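// evaluate a token sequence in chunks of at most n_batch tokens, advancing *n_past as the KV cache fills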
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
int N = (int) tokens.size();
for (int i = 0; i < N; i += n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
fprintf(stderr, "%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
*n_past += n_eval;
}
return true;
}
static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
std::vector<llama_token> tokens;
tokens.push_back(id);
return eval_tokens(ctx_llama, tokens, 1, n_past);
}
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
std::string str2 = str;
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
// printf("prompt token ids: ");
// for (int i = 0; i < (int) embd_inp.size(); i++) {
// printf("%d ", embd_inp[i]);
// }
// printf("\n");
eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
return true;
}
static const char * sample(struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_llama,
int * n_past) {
const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
static std::string ret;
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
ret = "</s>";
} else {
ret = llama_token_to_piece(ctx_llama, id);
}
eval_id(ctx_llama, id, n_past);
return ret.c_str();
}
static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
static const char* IMG_BASE64_TAG_END = "\">";
static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
}
static bool prompt_contains_image(const std::string& prompt) {
size_t begin, end;
find_image_tag_in_prompt(prompt, begin, end);
return (begin != std::string::npos);
}
// replaces the base64 image tag in the prompt with `replacement`
static internvl_image_embed * internvl_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
size_t img_base64_str_start, img_base64_str_end;
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
fprintf(stderr, "%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
return NULL;
}
auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count );
auto required_bytes = base64::required_encode_size(base64_str.size());
auto img_bytes = std::vector<unsigned char>(required_bytes);
base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
auto embed = internvl_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
if (!embed) {
fprintf(stderr, "%s: could not load image from base64 string.\n", __func__);
return NULL;
}
return embed;
}
static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
size_t begin, end;
find_image_tag_in_prompt(prompt, begin, end);
if (begin == std::string::npos || end == std::string::npos) {
return prompt;
}
auto pre = prompt.substr(0, begin);
auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
return pre + replacement + post;
}
struct internvl_context {
struct clip_ctx * ctx_clip = NULL;
struct llama_context * ctx_llama = NULL;
struct llama_model * model = NULL;
};
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
LOG_TEE("\n example usage:\n");
LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static struct internvl_image_embed * load_image(internvl_context * ctx_internvl, gpt_params * params, const std::string & fname) {
// load and preprocess the image
internvl_image_embed * embed = NULL;
auto prompt = params->prompt;
if (prompt_contains_image(prompt)) {
if (!params->image.empty()) {
fprintf(stderr, "using base64 encoded image instead of command line image path\n");
}
embed = internvl_image_embed_make_with_prompt_base64(ctx_internvl->ctx_clip, params->n_threads, prompt);
if (!embed) {
fprintf(stderr, "%s: can't load image from prompt\n", __func__);
return NULL;
}
params->prompt = remove_image_from_prompt(prompt);
} else {
embed = internvl_image_embed_make_with_filename(ctx_internvl->ctx_clip, params->n_threads, fname.c_str());
if (!embed) {
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
return NULL;
}
}
return embed;
}
// prompt token ids = [user_id, tokens_id, assistant_id]
// total embedding = concat(img_embedding, tokens_id_embedding)
static void process_prompt(struct internvl_context * ctx_internvl, struct internvl_image_embed * image_embed, gpt_params * params, const std::string & prompt) {
int n_past = 0;
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_internvl->ctx_llama));
// llava chat format is "'<|im_start|>system\nYou are an AI assistant whose name is InternLM (书生·浦语).<|im_end|><|im_start|>user\n<image>\n请描述图片.<|im_end|><|im_start|>assistant\n'"
std::size_t img_tok_pos = prompt.find("<image>");
std::string prompt1;
std::string prompt2;
if (img_tok_pos != std::string::npos) {
prompt1 = prompt.substr(0, img_tok_pos);
prompt2 = prompt.substr(img_tok_pos + 7);
}
else {
prompt1 = "";
prompt2 = "\n" + prompt;
}
eval_string(ctx_internvl->ctx_llama, ("<|im_start|>system\n你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型英文名叫InternVL, 是一个有用无害的人工智能助手。<|im_end|><|im_start|>user\n" + prompt1 + "<img>").c_str(), params->n_batch, &n_past, true);
// eval_string(ctx_internvl->ctx_llama, ("<|im_start|>system\nYou are an AI assistant whose name is InternLM (书生·浦语).<|im_end|><|im_start|>user\n" + prompt1 + "<img>").c_str(), params->n_batch, &n_past, true);
internvl_eval_image_embed(ctx_internvl->ctx_llama, image_embed, params->n_batch, &n_past);
eval_string(ctx_internvl->ctx_llama, ("</img>" + prompt2 + "<|im_end|><|im_start|>assistant\n").c_str(), params->n_batch, &n_past, false);
// generate the response
fprintf(stderr, "\n");
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
if (params->n_predict == -1) {
while (true) {
const char *tmp = sample(ctx_sampling, ctx_internvl->ctx_llama, &n_past);
if (strcmp(tmp, "</s>") == 0 || strcmp(tmp, "<|im_end|>") == 0)
break;
printf("%s", tmp);
fflush(stdout);
}
} else {
for (int i = 0; i < max_tgt_len; i++) {
const char *tmp = sample(ctx_sampling, ctx_internvl->ctx_llama, &n_past);
if (strcmp(tmp, "</s>") == 0 || strcmp(tmp, "<|im_end|>") == 0)
break;
printf("%s", tmp);
fflush(stdout);
}
}
llama_sampling_free(ctx_sampling);
printf("\n");
}
static struct llama_model * internvl_init(gpt_params * params) {
llama_backend_init();
llama_numa_init(params->numa);
llama_model_params model_params = llama_model_params_from_gpt_params(*params);
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
return NULL;
}
return model;
}
static struct llama_context * llama_init_context(gpt_params * params, llama_model * model) {
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) {
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
return NULL;
}
return ctx_llama;
}
static struct internvl_context * internvl_init_context(gpt_params * params, llama_model * model) {
const char * clip_path = params->mmproj.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
// load visual model
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
auto ctx_internvl = (struct internvl_context *)malloc(sizeof(internvl_context));
ctx_internvl->ctx_llama = NULL;
ctx_internvl->ctx_clip = ctx_clip;
ctx_internvl->model = model;
return ctx_internvl;
}
static void internvl_free(struct internvl_context * ctx_internvl) {
if (ctx_internvl->ctx_clip) {
clip_free(ctx_internvl->ctx_clip);
ctx_internvl->ctx_clip = NULL;
}
llama_free(ctx_internvl->ctx_llama);
llama_free_model(ctx_internvl->model);
llama_backend_free();
}
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
LOG_TEE("%s", text);
}
int main(int argc, char ** argv) {
ggml_time_init();
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
return 1;
}
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("internvl", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
llama_log_set(llama_log_callback_logTee, nullptr);
#endif // LOG_DISABLE_LOGS
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
print_usage(argc, argv, params);
return 1;
}
// printf("[debug by cxt] use prompt: %s\n", params.prompt.c_str());
// printf("[debug by cxt] concat_image_text_embedding: %d\n", params.concat_image_text_embedding);
// printf("[debug by cxt] bench_perf: %d\n", params.bench_perf);
auto model = internvl_init(&params);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to init internvl\n", __func__);
return 1;
}
// auto prompt_embed = load_adaptor(ctx_internvl, params.prompt.c_str(), false);
// printf("%s: prompt context:%s, token size: %d\n", __func__, params.prompt.c_str(), prompt_embed->n_token);
// for (int i=0; i<prompt_embed->n_token; i++) {
// int col_num=5;
// printf("[%d,:%d]: ", i, col_num);
// for (int j=0; j<col_num; j++) {
// printf("%f ", prompt_embed->embed[i*4096 + j]);
// }
// printf(" [%d,-%d:]: ", i, col_num);
// for (int j=0; j<col_num; j++) {
// printf("%f ", prompt_embed->embed[i*4096 + 4096 - col_num + j]);
// }
// printf("\n");
// }
// auto ctx_llama = llama_init_context(&params, model);
auto ctx_internvl = internvl_init_context(&params, model);
ctx_internvl->ctx_llama = llama_init_context(&params, model);
for (auto & image : params.image) {
for (int i=0; i<15; i++) {
ctx_internvl->ctx_llama = llama_init_context(&params, model);
// // clear kv cache
// llama_kv_cache_clear(ctx_internvl->ctx_llama);
const int64_t t_e2e_start_us = ggml_time_us();
auto image_embed = load_image(ctx_internvl, &params, image);
if (!image_embed) {
std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
return 1;
}
// process the prompt
process_prompt(ctx_internvl, image_embed, &params, params.prompt);
const int64_t t_e2e_end_us = ggml_time_us();
float t_e2e_cost_ms = (t_e2e_end_us - t_e2e_start_us) / 1000.0;
LOG_TEE("\n%s: %d e2e in %8.2f ms\n", __func__, i, t_e2e_cost_ms);
llama_print_timings(ctx_internvl->ctx_llama);
// internvl_adaptor_embed_free(prompt_embed);
internvl_image_embed_free(image_embed);
// ctx_internvl->model = NULL;
// internvl_free(ctx_internvl);
}
}
llama_free_model(model);
return 0;
}


@@ -0,0 +1,375 @@
#include "clip.h"
#include "ggml.h"
#include "common.h"
#include "llama.h"
#include "internvl.h"
#include "base64.hpp"
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <numeric>
#include <cstdarg>
#include <climits>
// RGB uint8 image
struct clip_image_u8 {
int nx;
int ny;
std::vector<uint8_t> buf;
};
// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
struct clip_image_f32 {
int nx;
int ny;
std::vector<float> buf;
};
struct clip_image_grid_shape {
int first;
int second;
};
static std::string format(const char * fmt, ...) {
va_list ap;
va_list ap2;
va_start(ap, fmt);
va_copy(ap2, ap);
int size = vsnprintf(NULL, 0, fmt, ap);
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
std::vector<char> buf(size + 1);
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
GGML_ASSERT(size2 == size);
va_end(ap2);
va_end(ap);
return std::string(buf.data(), buf.size());
}
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
switch (type) {
case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
default: return format("unknown type %d", type);
}
}
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
std::string result;
for (size_t pos = 0; ; pos += search.length()) {
auto new_pos = s.find(search, pos);
if (new_pos == std::string::npos) {
result += s.substr(pos, s.size() - pos);
break;
}
result += s.substr(pos, new_pos - pos) + replace;
pos = new_pos;
}
s = std::move(result);
}
static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
switch (type) {
case GGUF_TYPE_STRING:
return gguf_get_val_str(ctx_gguf, i);
case GGUF_TYPE_ARRAY:
{
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
int arr_n = gguf_get_arr_n(ctx_gguf, i);
const void * data = gguf_get_arr_data(ctx_gguf, i);
std::stringstream ss;
ss << "[";
for (int j = 0; j < arr_n; j++) {
if (arr_type == GGUF_TYPE_STRING) {
std::string val = gguf_get_arr_str(ctx_gguf, i, j);
// escape quotes
replace_all(val, "\\", "\\\\");
replace_all(val, "\"", "\\\"");
ss << '"' << val << '"';
} else if (arr_type == GGUF_TYPE_ARRAY) {
ss << "???";
} else {
ss << gguf_data_to_str(arr_type, data, j);
}
if (j < arr_n - 1) {
ss << ", ";
}
}
ss << "]";
return ss.str();
}
default:
return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
}
}
static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
size_t tensor_size = ggml_nbytes(tensor);
printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
}
/**
* Selects the best resolution from a list of possible resolutions based on the original size.
*
* @param original_size The original size of the image in the format (width, height).
* @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
* @return The best fit resolution in the format (width, height).
*/
static std::pair<int, int> select_best_resolution(const std::pair<int, int>& original_size, const std::vector<std::pair<int, int>>& possible_resolutions) {
int original_width = original_size.first;
int original_height = original_size.second;
std::pair<int, int> best_fit;
int max_effective_resolution = 0;
int min_wasted_resolution = std::numeric_limits<int>::max();
for (const auto& resolution : possible_resolutions) {
int width = resolution.first;
int height = resolution.second;
float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
int downscaled_width = static_cast<int>(original_width * scale);
int downscaled_height = static_cast<int>(original_height * scale);
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
int wasted_resolution = (width * height) - effective_resolution;
// fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
max_effective_resolution = effective_resolution;
min_wasted_resolution = wasted_resolution;
best_fit = resolution;
}
}
return best_fit;
}
/**
* @brief Get the anyres image grid shape object
*
* @param image_size
* @param grid_pinpoints
* @param image_patch_size
* @return <int, int>
*/
static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<int, int> & image_size, const std::vector<std::pair<int, int>> & grid_pinpoints, int image_patch_size) {
/**
Conversion from gguf flat array to vector:
std::vector<std::pair<int, int>> possible_resolutions;
for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
}
*/
auto best_resolution = select_best_resolution(image_size, grid_pinpoints);
return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size};
}
static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
// std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
const int64_t t_img_process_start_us = ggml_time_us();
clip_image_f32_batch img_res_v;
img_res_v.size = 0;
img_res_v.data = nullptr;
const int64_t t_img_preprocess_start_us = ggml_time_us();
if (!clip_image_preprocess(ctx_clip, img, img_res_v)) {
fprintf(stderr, "%s: unable to preprocess image\n", __func__);
delete[] img_res_v.data;
return false;
}
const int64_t t_img_process_end_us = ggml_time_us();
float t_img_process_cost_ms = (t_img_process_end_us - t_img_process_start_us) / 1000.0;
LOG_TEE("\n%s: image processed in %8.2f ms\n", __func__, t_img_process_cost_ms);
const int64_t t_img_enc_start_us = ggml_time_us();
const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
// flat / default internvl-1.5 type embedding
*n_img_pos = clip_n_patches(ctx_clip);
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
delete[] img_res_v.data;
if (!encoded) {
fprintf(stderr, "Unable to encode image\n");
return false;
}
LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
const int64_t t_img_enc_end_us = ggml_time_us();
float t_img_preprocess_ms = (t_img_enc_start_us - t_img_preprocess_start_us) / 1000.0;
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
LOG_TEE("\n%s: image preprocessed in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_preprocess_ms, t_img_preprocess_ms / *n_img_pos);
LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
return true;
}
bool internvl_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
// make sure that the correct mmproj was used, i.e., compare apples to apples
int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
if (n_image_embd != n_llama_embd) {
printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
return false;
}
return true;
}
bool internvl_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
// allocate memory for out embeddings, default batch_size is 6, buff_size = 268 * 4096 * sizeof(float) * 6
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/internvl model
if (!image_embd) {
fprintf(stderr, "Unable to allocate memory for image embeddings\n");
return false;
}
int n_img_pos;
if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
fprintf(stderr, "%s: cannot encode image, aborting\n", __func__);
free(image_embd);
return false;
}
*image_embd_out = image_embd;
*n_img_pos_out = n_img_pos;
return true;
}
bool internvl_eval_image_embed(llama_context * ctx_llama, const struct internvl_image_embed * image_embed, int n_batch, int * n_past) {
int n_embd = llama_n_embd(llama_get_model(ctx_llama));
for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
int n_eval = image_embed->n_image_pos - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
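// the batch feeds the image embeddings directly through the embd pointer (token ids are null), starting at position *n_past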
llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
if (llama_decode(ctx_llama, batch)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
*n_past += n_eval;
}
return true;
}
struct internvl_image_embed * internvl_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
const int64_t t_img_dec_start_us = ggml_time_us();
clip_image_u8 * img = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
clip_image_u8_free(img);
fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__);
return NULL;
}
const int64_t t_img_dec_end_us = ggml_time_us();
float t_img_dec_ms = (t_img_dec_end_us - t_img_dec_start_us) / 1000.0;
LOG_TEE("\n%s: image encoded in %8.2f ms\n", __func__, t_img_dec_ms);
float* image_embed = NULL;
int n_image_pos = 0;
bool image_embed_result = internvl_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
if (!image_embed_result) {
clip_image_u8_free(img);
fprintf(stderr, "%s: coulnd't embed the image\n", __func__);
return NULL;
}
clip_image_u8_free(img);
auto result = (internvl_image_embed*)malloc(sizeof(internvl_image_embed));
result->embed = image_embed;
result->n_image_pos = n_image_pos;
return result;
}
static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
auto file = fopen(path, "rb");
if (file == NULL) {
fprintf(stderr, "%s: can't read file %s\n", __func__, path);
return false;
}
fseek(file, 0, SEEK_END);
auto fileSize = ftell(file);
fseek(file, 0, SEEK_SET);
auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
if (buffer == NULL) {
fprintf(stderr, "%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
perror("Memory allocation error");
fclose(file);
return false;
}
errno = 0;
size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer
if (ferror(file)) {
die_fmt("read error: %s", strerror(errno));
}
if (ret != (size_t) fileSize) {
die("unexpectedly reached end of file");
}
fclose(file); // Close the file
*bytesOut = buffer;
*sizeOut = fileSize;
return true;
}
struct internvl_image_embed * internvl_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
unsigned char* image_bytes;
long image_bytes_length;
const int64_t t_img_load_start_us = ggml_time_us();
auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
if (!loaded) {
fprintf(stderr, "%s: failed to load %s\n", __func__, image_path);
return NULL;
}
const int64_t t_img_load_end_us = ggml_time_us();
float t_img_load_ms = (t_img_load_end_us - t_img_load_start_us) / 1000.0;
LOG_TEE("\n%s: image loaded in %8.2f ms\n", __func__, t_img_load_ms);
const int64_t t_img_enc_start_us = ggml_time_us();
internvl_image_embed *embed = internvl_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
const int64_t t_img_enc_end_us = ggml_time_us();
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
LOG_TEE("\n%s: image encoded in %8.2f ms\n", __func__, t_img_enc_ms);
free(image_bytes);
return embed;
}
void internvl_image_embed_free(struct internvl_image_embed * embed) {
free(embed->embed);
free(embed);
}


@@ -0,0 +1,51 @@
#ifndef INTERNVL_H
#define INTERNVL_H
#include "ggml.h"
#ifdef LLAMA_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef LLAMA_BUILD
# define INTERNVL_API __declspec(dllexport)
# else
# define INTERNVL_API __declspec(dllimport)
# endif
# else
# define INTERNVL_API __attribute__ ((visibility ("default")))
# endif
#else
# define INTERNVL_API
#endif
struct clip_ctx;
#ifdef __cplusplus
extern "C" {
#endif
struct internvl_image_embed {
float * embed;
int n_image_pos;
};
/** sanity check for clip <-> internvl embed size match */
INTERNVL_API bool internvl_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
INTERNVL_API bool internvl_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
/** build an image embed from image file bytes */
INTERNVL_API struct internvl_image_embed * internvl_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
/** build an image embed from a path to an image filename */
INTERNVL_API struct internvl_image_embed * internvl_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
/** free an embedding made with internvl_image_embed_make_* */
INTERNVL_API void internvl_image_embed_free(struct internvl_image_embed * embed);
/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
INTERNVL_API bool internvl_eval_image_embed(struct llama_context * ctx_llama, const struct internvl_image_embed * embed, int n_batch, int * n_past);
#ifdef __cplusplus
}
#endif
#endif


@@ -0,0 +1,59 @@
#!/bin/bash
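# Push the test image and the llama-internvl-cli binary to an adb-connected Rockchip board, run inference there, and pull the log back into ./output.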
model_dir="/home/qianlangyu/model/dms-single-img"
projector_name="model.safetensors-q4_0.gguf"
# projector_name="InternViT-300M-448px-q4_k.gguf"
# llama_name="internlm2-1.8B-chat-F16.gguf"
llama_name="Dms-Single-Img-630M-q4_0.gguf"
img_dir="/home/qianlangyu/model/checkpoint-2000-merged/images/"
img_name="baixiancui_cloud_agent_1707907703002_37313.jpg"
prompt="<image>\n<dms>"
# img_name="cat.jpeg"
# prompt="<image>\nWhat is in the image?"
# img_name="demo.jpg"
# prompt="<image>\nWho is the author of this book? \nAnswer the question using a single word or phrase."
program_dir="build/bin"
binName="llama-internvl-cli"
n_threads=4
deviceDir="/data/qianlangyu/dms"
saveDir="output"
if [ ! -d ${saveDir} ]; then
mkdir ${saveDir}
fi
function rk_run() {
# # copy resource into device
# adb push ${model_dir}/${projector_name} ${deviceDir}/${projector_name}
# adb push ${model_dir}/${llama_name} ${deviceDir}/${llama_name}
adb push ${img_dir}/${img_name} ${deviceDir}/${img_name}
# copy program into device
adb push ${program_dir}/${binName} ${deviceDir}/${binName}
adb shell "chmod 0777 ${deviceDir}/${binName}"
# run
adb shell "echo cd ${deviceDir} LD_LIBRARY_PATH=/data/qianlangyu/dms/lib ${deviceDir}/${binName} \
-m ${deviceDir}/${llama_name} \
--mmproj ${deviceDir}/${projector_name} \
-t ${n_threads} \
--image ${deviceDir}/${img_name} \
-p \"${prompt}\" \
-b 512 -c 512 \
> ${deviceDir}/${modelName}_${projector_name}_${llama_name}_${n_threads}_${img_name}.txt"
adb shell "cd ${deviceDir}; pwd; LD_LIBRARY_PATH=/data/qianlangyu/dms/lib ${deviceDir}/${binName} \
-m ${deviceDir}/${llama_name} \
--mmproj ${deviceDir}/${projector_name} \
-t ${n_threads} \
--image ${deviceDir}/${img_name} \
-p \"${prompt}\" \
-b 512 -c 512 \
>> ${deviceDir}/${modelName}_${projector_name}_${llama_name}_${n_threads}_${img_name}.txt 2>&1"
adb pull ${deviceDir}/${modelName}_${projector_name}_${llama_name}_${n_threads}_${img_name}.txt ${saveDir}
}
rk_run
echo "rk_run is Done!"


@@ -0,0 +1,6 @@
#!/bin/bash
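# Cross-compile llama.cpp for the RK3588 board with an aarch64-linux-gnu CMake toolchain; extra CMake options can be passed as $1.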
cmake ../../../../ \
-DCMAKE_TOOLCHAIN_FILE=/home/qianlangyu/software/rk-3588/aarch64-linux-gnu.toolchain.cmake \
-DCMAKE_BUILD_TYPE=Release $1
make -j4

33 examples/internvl/run_gpu.sh Executable file

@@ -0,0 +1,33 @@
#!/bin/bash
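# Run llama-internvl-cli on the local GPU build; the first argument selects the language model quantization (fp16 or q4_k).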
if [ -z "$1" ]; then
echo "Usage: $0 quant_type[fp16 or q4_k]"
exit 1
fi
quant_type=$1
if [ "$quant_type" != "fp16" ] && [ "$quant_type" != "q4_k" ]; then
echo "Usage: $0 quant_type[fp16 or q4_k]"
exit 1
fi
SCRIPT_PATH=$(dirname "$(readlink -f "$0")")
ROOT_PATH=${SCRIPT_PATH}/../../
resource_root=/home/chenxiaotao03/model/llama.cpp/internvl-chat-2b-v1-5
llm_model_name=internlm2-1.8B-chat-q4_k.gguf
if [ "$quant_type" == "fp16" ]; then
llm_model_name=internlm2-1.8B-chat-F16.gguf
fi
${ROOT_PATH}/build/bin/llama-internvl-cli \
-m ${resource_root}/${llm_model_name} \
--mmproj ${resource_root}/InternViT-300M-448px-f16.gguf \
-t 4 \
--image ${resource_root}/image1.jpg \
-p "<image>\n请详细描述图片" \
--gpu-layers 1000 \
-b 4096 -c 4096 \
-fa


@@ -0,0 +1,57 @@
#!/bin/bash
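# Push the test image and the llama-internvl-cli binary to an adb-connected RV11xx board, run inference there, and pull the log back into ./output.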
model_dir="/home/qianlangyu/model/dms-single-img"
projector_name="model.safetensors-q4_0.gguf"
# projector_name="InternViT-300M-448px-q4_k.gguf"
# llama_name="internlm2-1.8B-chat-F16.gguf"
llama_name="Dms-Single-Img-630M-q4_0.gguf"
img_dir="/home/qianlangyu/model/checkpoint-2000-merged/images/"
img_name="baixiancui_cloud_agent_1707907703002_37313.jpg"
prompt="<image>\n<dms>"
# img_name="cat.jpeg"
# prompt="<image>\nWhat is in the image?"
# img_name="demo.jpg"
# prompt="<image>\nWho is the author of this book? \nAnswer the question using a single word or phrase."
program_dir="build/bin"
binName="llama-internvl-cli"
n_threads=4
deviceDir="/userdata/media/qianlangyu/dms"
saveDir="output"
if [ ! -d ${saveDir} ]; then
mkdir ${saveDir}
fi
function rv_run() {
# # copy resource into device
# adb push ${model_dir}/${projector_name} ${deviceDir}/${projector_name}
# adb push ${model_dir}/${llama_name} ${deviceDir}/${llama_name}
adb push ${img_dir}/${img_name} ${deviceDir}/${img_name}
# copy program into device
adb push ${program_dir}/${binName} ${deviceDir}/${binName}
adb shell "chmod 0777 ${deviceDir}/${binName}"
# run
adb shell "echo cd ${deviceDir} LD_LIBRARY_PATH=/userdata/media/qianlangyu/dms/lib ${deviceDir}/${binName} \
-m ${deviceDir}/${llama_name} \
--mmproj ${deviceDir}/${projector_name} \
-t ${n_threads} \
--image ${deviceDir}/${img_name} \
-p \"${prompt}\" \
> ${deviceDir}/${modelName}_${projector_name}_${llama_name}_${n_threads}_${img_name}.txt"
adb shell "cd ${deviceDir}; pwd; LD_LIBRARY_PATH=/userdata/media/qianlangyu/dms/lib ${deviceDir}/${binName} \
-m ${deviceDir}/${llama_name} \
--mmproj ${deviceDir}/${projector_name} \
-t ${n_threads} \
--image ${deviceDir}/${img_name} \
-p \"${prompt}\" \
>> ${deviceDir}/${modelName}_${projector_name}_${llama_name}_${n_threads}_${img_name}.txt 2>&1"
adb pull ${deviceDir}/${modelName}_${projector_name}_${llama_name}_${n_threads}_${img_name}.txt ${saveDir}
}
rv_run
echo "rv_run is Done!"


@@ -0,0 +1,7 @@
#!/bin/bash
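# Cross-compile llama.cpp for the RV11xx (armv7-a) board with the arm-linux-gnueabihf CMake toolchain.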
cmake ../../../../ \
-DCMAKE_TOOLCHAIN_FILE=/home/qianlangyu/resource/toolchain/gcc-arm-8.3-2019.03-x86_64-arm-linux-gnueabihf/rv11xx_toolchain.cmake \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_NATIVE=OFF
make -j4


@@ -0,0 +1,27 @@
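# split_language_tensors.py: extract the language_model.* tensors from the combined Mini-InternVL safetensors checkpoint into a standalone safetensors file that convert_hf_to_gguf.py can consume.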
import argparse
import os
import torch
from safetensors.torch import load_file, save_file
import numpy as np
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model-path", help=".pth model path", required=True)
ap.add_argument("-o", "--output-path", help="Path to save language model. Default is the original model directory", default=None)
args = ap.parse_args()
model_path = args.model_path
model = load_file(model_path)
dir_model = os.path.dirname(model_path)
output_path = args.output_path if args.output_path is not None else os.path.join(dir_model, "language_model.safetensors")
# print(os.path.getsize("language_model.safetensors"))
language_tensors = {}
for name, data in model.items():
print(f"Name: {name}, data: {data.shape}, dtype: {data.dtype}")
if name.find("language_model.") != -1:
language_tensors[name.replace("language_model.", "")] = data
save_file(language_tensors, output_path)


@@ -0,0 +1,121 @@
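# vision_model_to_gguf.py: export the InternViT vision encoder from the InternVL checkpoint to GGUF, writing CLIP-style vision hyperparameters and splitting fused attn.qkv tensors into separate q/k/v tensors.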
import argparse
import os
import json
import torch
from safetensors.torch import load_file
import numpy as np
from gguf import *
VISION = "clip.vision"
def k(raw_key: str, arch: str) -> str:
return raw_key.format(arch=arch)
def get_tensor_name(name: str) -> str:
return name.replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("norm", "ln")
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model-path", help=".pth model path", required=True)
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
args = ap.parse_args()
model_path = args.model_path
model_name = os.path.basename(model_path).replace(".pth", "")
dir_model = os.path.dirname(model_path)
# possible data types
# ftype == 0 -> float32
# ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]
ftype = 1
if args.use_f32:
ftype = 0
# load the model
if model_path.endswith(".pth"):
model = torch.load(model_path, map_location=torch.device('cpu'))
else:
# model = GGUFReader(model_path)
# tensors = model.tensors
model = load_file(model_path)
# for t in tensors:
# print(f"Name: {t.name}, data: {t.shape}, dtype: {t.tensor_type}")
# for name, data in model.items():
# print(f"Name: {name}, data: {data.shape}, dtype: {data.dtype}")
# exit(0)
output_dir = args.output_dir if args.output_dir is not None else dir_model
# os.makedirs(output_dir, exist_ok=True)
output_prefix = os.path.basename(output_dir).replace("ggml_", "")
fname_out = os.path.join(output_dir, f"{model_name}-{ftype_str[ftype]}.gguf")
fout = GGUFWriter(path=fname_out, arch=model_name)
fout.add_file_type(ftype)
fout.add_name(model_name)
fout.add_description("Vision Transformer model")
with open(os.path.join(dir_model, "config.json"), "r", encoding="utf-8") as config_file:
config = json.load(config_file)
hparams = config["vision_config"]
fout.add_uint32("clip.vision.image_size", hparams["image_size"])
fout.add_uint32("clip.vision.patch_size", hparams["patch_size"])
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), hparams["hidden_size"])
fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), hparams["intermediate_size"])
# fout.add_uint32("clip.vision.projection_dim", hparams.get("projection_dim", config["projection_dim"]))
fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), hparams["num_attention_heads"])
fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), hparams["layer_norm_eps"])
block_count = hparams["num_hidden_layers"]
fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
with open(os.path.join(dir_model, "preprocessor_config.json"), "r", encoding="utf-8") as f:
preprocessor_config = json.load(f)
image_mean = preprocessor_config["image_mean"]
image_std = preprocessor_config["image_std"]
fout.add_array("clip.vision.image_mean", image_mean)
fout.add_array("clip.vision.image_std", image_std)
for name, data in model.items():
if name.find('language_model') != -1:
continue
name = get_tensor_name(name)
data = data.float().numpy()
# pw and dw conv ndim==4
if (data.ndim == 2 or data.ndim == 4) and ftype == 1:
data = data.astype(np.float16)
# split in weight/bias into q,k,v
if ".attn.qkv" in name:
# [1024*3, 1024] -> 3*[1024, 1024]
print(f"Splitting {name} with shape {data.shape}")
if data.shape[0] == 1024*3:
data = data.reshape(3, 1024, -1)
qkv = [data[0].squeeze(), data[1].squeeze(), data[2].squeeze()]
elif data.shape[0] == 1024*3:
qkv = np.split(data, 3, axis=0)
else:
raise ValueError(f"Unknown shape {data.shape}")
print(f"{name} shape {data.shape} split into {len(qkv)} shape: {qkv[0].shape}, {qkv[1].shape}, {qkv[2].shape}")
fout.add_tensor(name.replace(".attn.qkv", ".attn.q"), qkv[0])
fout.add_tensor(name.replace(".attn.qkv", ".attn.k"), qkv[1])
fout.add_tensor(name.replace(".attn.qkv", ".attn.v"), qkv[2])
else:
fout.add_tensor(name, data)
fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.write_tensors_to_file()
fout.close()


@@ -1113,6 +1113,9 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
# Android armeabi-v7a
list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7-a")
# rv-11xx
list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
else()
# Raspberry Pi 2
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)