Merge 951f1d9053 into 1b28061400

llama : skip token bounds check when evaluating embeddings (#9437)
Merge remote-tracking branch 'origin' into add-support-for-phi3-vision
2024-09-22 21:16:20 +00:00 · 2024-09-11 12:40:07 -04:00 · 2024-09-11 17:52:13 +02:00 · 2024-08-27 18:13:54 -04:00 · 2024-08-27 18:11:41 -04:00
2 changed files with 159 additions and 14 deletions
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -132,6 +132,8 @@ static std::string format(const char * fmt, ...) {
 #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
 #define TN_MVLM_PROJ_PEG   "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE   "model.image_newline"
+#define TN_SUB_GN          "v.sub_gn"
+#define TN_GLB_GN          "v.glb_gn"

 #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
 #define TN_MINICPMV_QUERY "resampler.query"
@@ -530,6 +532,9 @@ struct clip_vision_model {
    struct ggml_tensor * mm_model_ln_kv_b;
    struct ggml_tensor * mm_model_ln_post_w;
    struct ggml_tensor * mm_model_ln_post_b;
+
+    struct ggml_tensor * sub_gn;
+    struct ggml_tensor * glb_gn;
 };

 struct clip_ctx {
@@ -777,6 +782,138 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32

        // print_tensor_info(embeddings, "embeddings");

+        // phi-3.5-vision-instruct
+        if (model.sub_gn && model.glb_gn) {
+            // Phi3VisionEmbedding.hd_transform()
+            ggml_tensor * x = embeddings;
+
+            int num_images = batch_size;
+            int h_crop = 1, w_crop = 1;
+
+            int C = x->ne[0];
+            int L = x->ne[1];
+            int N = x->ne[2];
+
+            int H = (int)sqrt((float)L);
+
+            GGML_ASSERT(H * H == L);
+
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+
+            // Phi3ImageEmbedding.reshape_hd_patches_2x2merge()
+            x = ggml_reshape_4d(ctx0, x, N, H, H, C);
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 1, 3));
+
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 0, 1, 2));
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 2, 3, 1, 0));
+
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+            x = ggml_reshape_4d(ctx0, x, 2, H / 2, 2, H / 2 * C * N);
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 1, 3));
+
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 1, 3, 2));
+            x = ggml_reshape_3d(ctx0, x, N * C * (H / 2), (H / 2), 4);
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+            x = ggml_reshape_4d(ctx0, x, 4, H / 2, H / 2, N * C);
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+            x = ggml_reshape_4d(ctx0, x, 4, (H / 2) * (H / 2), C, N);
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 3, 1, 2));
+
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+            x = ggml_reshape_4d(ctx0, x, 4 * C, H / 2, H / 2, N);
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+            x = ggml_reshape_4d(ctx0, x, (H / 2) * 4 * C, (H / 2), w_crop, num_images * h_crop);
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 1, 3));
+
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+            x = ggml_reshape_4d(ctx0, x, 4 * C, w_crop * (H / 2), h_crop * (H / 2), num_images);
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+
+            ggml_tensor * global_image_features_hd = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+
+            // Phi3ImageEmbedding.add_image_newline()
+            ggml_tensor * newline_embedding = model.sub_gn;
+            for (int i = 0; i < H/2-1; i++) {
+                newline_embedding = ggml_concat(ctx0, newline_embedding, model.sub_gn, 2);
+            }
+            ggml_tensor * global_image_features_hd_newline = ggml_concat(ctx0, global_image_features_hd, newline_embedding, 1);
+
+            global_image_features_hd_newline = ggml_cont(ctx0, ggml_permute(ctx0, global_image_features_hd_newline, 3, 2, 1, 0));
+            global_image_features_hd_newline = ggml_reshape_4d(ctx0, global_image_features_hd_newline, 1, 1, (w_crop*(H/2)+1) * h_crop*(H/2), 4*C);
+            global_image_features_hd_newline = ggml_cont(ctx0, ggml_permute(ctx0, global_image_features_hd_newline, 3, 2, 1, 0));
+
+            h_crop = image_size / 336;
+            w_crop = image_size / 336;
+
+            // sub_image_features_hd
+            x = embeddings;
+
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+
+            // Phi3ImageEmbedding.reshape_hd_patches_2x2merge()
+            x = ggml_reshape_4d(ctx0, x, N, H, H, C);
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 1, 3));
+
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 0, 1, 2));
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 2, 3, 1, 0));
+
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+            x = ggml_reshape_4d(ctx0, x, 2, H / 2, 2, H / 2 * C * N);
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 1, 3));
+
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 1, 3, 2));
+            x = ggml_reshape_3d(ctx0, x, N * C * (H / 2), (H / 2), 4);
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+            x = ggml_reshape_4d(ctx0, x, 4, H / 2, H / 2, N * C);
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+            x = ggml_reshape_4d(ctx0, x, 4, (H / 2) * (H / 2), C, N);
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 3, 1, 2));
+
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+            x = ggml_reshape_4d(ctx0, x, 4 * C, H / 2, H / 2, N);
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+            x = ggml_reshape_4d(ctx0, x, (H / 2) * 4 * C, (H / 2), w_crop, num_images * h_crop);
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 1, 3));
+
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+            x = ggml_reshape_4d(ctx0, x, 4 * C, w_crop * (H / 2), h_crop * (H / 2), num_images);
+            x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+
+            ggml_tensor * sub_image_features_hd = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
+
+            // Phi3ImageEmbedding.add_image_newline()
+            newline_embedding = model.sub_gn;
+            for (int i = 0; i < (H/2-1); i++) {
+                newline_embedding = ggml_concat(ctx0, newline_embedding, model.sub_gn, 2);
+            }
+            ggml_tensor * sub_image_features_hd_newline = ggml_concat(ctx0, sub_image_features_hd, newline_embedding, 1);
+
+            sub_image_features_hd_newline = ggml_cont(ctx0, ggml_permute(ctx0, sub_image_features_hd_newline, 3, 2, 1, 0));
+            sub_image_features_hd_newline = ggml_reshape_4d(ctx0, sub_image_features_hd_newline, 1, 1, (w_crop*(H/2)+1) * h_crop*(H/2), 4*C);
+            sub_image_features_hd_newline = ggml_cont(ctx0, ggml_permute(ctx0, sub_image_features_hd_newline, 3, 2, 1, 0));
+
+            embeddings = ggml_concat(ctx0, sub_image_features_hd_newline, model.glb_gn, 1);
+            embeddings = ggml_concat(ctx0, embeddings, global_image_features_hd_newline, 1);
+        }
+
        // llava projector
        if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
            embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
@@ -1402,6 +1539,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
                // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
            } catch (std::runtime_error & /*e*/) { }
+            try {
+                vision_model.sub_gn = get_tensor(new_clip->ctx_data, TN_SUB_GN);
+                vision_model.glb_gn = get_tensor(new_clip->ctx_data, TN_GLB_GN);
+            } catch (std::runtime_error & /*e*/) { }
        } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
            // MobileVLM projection
            vision_model.mm_model_mlp_1_w               = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16076,19 +16076,21 @@ static int llama_decode_internal(
        return -1;
    }

-    for (uint32_t i = 0; i < n_tokens_all; ++i) {
-        if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
-            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
-            return -1;
-        }
-    }
-
    const auto & model   = lctx.model;
    const auto & hparams = model.hparams;
    const auto & cparams = lctx.cparams;

    GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT

+    if (batch_all.token) {
+        for (uint32_t i = 0; i < n_tokens_all; ++i) {
+            if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
+                return -1;
+            }
+        }
+    }
+
    GGML_ASSERT(n_tokens_all <= cparams.n_batch);

    GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
@@ -16375,19 +16377,21 @@ static int llama_encode_internal(
        return -1;
    }

-    for (uint32_t i = 0; i < n_tokens; ++i) {
-        if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
-            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
-            return -1;
-        }
-    }
-
    const auto & model   = lctx.model;
    const auto & hparams = model.hparams;
    const auto & cparams = lctx.cparams;

    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT

+    if (batch.token) {
+        for (uint32_t i = 0; i < n_tokens; ++i) {
+            if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
+                return -1;
+            }
+        }
+    }
+
    // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
    GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
Author	SHA1	Message	Date
Andrei	7683cfc26c	Merge `951f1d9053` into `1b28061400`	2024-09-11 12:40:07 -04:00
slaren	1b28061400	llama : skip token bounds check when evaluating embeddings (#9437 ) Some checks are pending Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Waiting to run Details Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Waiting to run Details Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Waiting to run Details Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Waiting to run Details Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Waiting to run Details Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Waiting to run Details Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Waiting to run Details Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Waiting to run Details Nix CI / nix-eval (macos-latest) (push) Waiting to run Details Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run Details Nix CI / nix-build (macos-latest) (push) Waiting to run Details Nix CI / nix-build (ubuntu-latest) (push) Waiting to run Details Python check requirements.txt / check-requirements (push) Waiting to run Details flake8 Lint / Lint (push) Waiting to run Details Python Type-Check / pyright type-check (push) Waiting to run Details	2024-09-11 17:52:13 +02:00
Andrei Betlen	951f1d9053	Merge remote-tracking branch 'origin' into add-support-for-phi3-vision	2024-08-27 18:13:54 -04:00
Andrei Betlen	dc0625ab8f	Add support for Phi3-vision-instruct	2024-08-27 18:11:41 -04:00