Compare commits

...

4 Commits

Author SHA1 Message Date
Andrei
7683cfc26c
Merge 951f1d9053 into 1b28061400 2024-09-11 12:40:07 -04:00
slaren
1b28061400
llama : skip token bounds check when evaluating embeddings (#9437)
Some checks are pending
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Waiting to run
Nix CI / nix-eval (macos-latest) (push) Waiting to run
Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run
Nix CI / nix-build (macos-latest) (push) Waiting to run
Nix CI / nix-build (ubuntu-latest) (push) Waiting to run
Python check requirements.txt / check-requirements (push) Waiting to run
flake8 Lint / Lint (push) Waiting to run
Python Type-Check / pyright type-check (push) Waiting to run
2024-09-11 17:52:13 +02:00
Andrei Betlen
951f1d9053 Merge remote-tracking branch 'origin' into add-support-for-phi3-vision 2024-08-27 18:13:54 -04:00
Andrei Betlen
dc0625ab8f Add support for Phi3-vision-instruct 2024-08-27 18:11:41 -04:00
2 changed files with 159 additions and 14 deletions

View File

@ -132,6 +132,8 @@ static std::string format(const char * fmt, ...) {
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
#define TN_IMAGE_NEWLINE "model.image_newline"
#define TN_SUB_GN "v.sub_gn"
#define TN_GLB_GN "v.glb_gn"
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
#define TN_MINICPMV_QUERY "resampler.query"
@ -530,6 +532,9 @@ struct clip_vision_model {
struct ggml_tensor * mm_model_ln_kv_b;
struct ggml_tensor * mm_model_ln_post_w;
struct ggml_tensor * mm_model_ln_post_b;
struct ggml_tensor * sub_gn;
struct ggml_tensor * glb_gn;
};
struct clip_ctx {
@ -777,6 +782,138 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
// print_tensor_info(embeddings, "embeddings");
// phi-3.5-vision-instruct
if (model.sub_gn && model.glb_gn) {
// This branch runs only when both the sub_gn and glb_gn tensors are present.
// It replaces `embeddings` with three pieces joined via ggml_concat(..., 1):
//   [sub-image features + newline] , glb_gn , [global features + newline]
// The same permute/reshape chain is applied twice to the incoming `embeddings`:
// first with h_crop = w_crop = 1 (global), then with crops derived from image_size (sub-image).
// Phi3VisionEmbedding.hd_transform()
ggml_tensor * x = embeddings;
int num_images = batch_size;
// global pass: a single 1x1 crop
int h_crop = 1, w_crop = 1;
// presumably C = embedding width, L = positions per image, N = number of images/crops — TODO confirm
int C = x->ne[0];
int L = x->ne[1];
int N = x->ne[2];
// H is the integer square root of L; the assert rejects any L that is not a perfect square
int H = (int)sqrt((float)L);
GGML_ASSERT(H * H == L);
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
// Phi3ImageEmbedding.reshape_hd_patches_2x2merge()
// NOTE(review): the chain below uses H / 2 throughout, so it presumably expects an even H — confirm
x = ggml_reshape_4d(ctx0, x, N, H, H, C);
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 1, 3));
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 0, 1, 2));
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 2, 3, 1, 0));
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
x = ggml_reshape_4d(ctx0, x, 2, H / 2, 2, H / 2 * C * N);
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 1, 3));
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 1, 3, 2));
x = ggml_reshape_3d(ctx0, x, N * C * (H / 2), (H / 2), 4);
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
x = ggml_reshape_4d(ctx0, x, 4, H / 2, H / 2, N * C);
// NOTE(review): back-to-back permute(3, 2, 1, 0) pairs (here and further down) look like
// they cancel each other out — confirm before simplifying
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
x = ggml_reshape_4d(ctx0, x, 4, (H / 2) * (H / 2), C, N);
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 3, 1, 2));
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
x = ggml_reshape_4d(ctx0, x, 4 * C, H / 2, H / 2, N);
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
// from here on the crop counts enter the shapes (both are 1 in this global pass)
x = ggml_reshape_4d(ctx0, x, (H / 2) * 4 * C, (H / 2), w_crop, num_images * h_crop);
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 1, 3));
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
x = ggml_reshape_4d(ctx0, x, 4 * C, w_crop * (H / 2), h_crop * (H / 2), num_images);
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
ggml_tensor * global_image_features_hd = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
// Phi3ImageEmbedding.add_image_newline()
// the loop adds H/2 - 1 more copies, so newline_embedding ends up as H/2 copies of sub_gn
// joined via ggml_concat(..., 2)
ggml_tensor * newline_embedding = model.sub_gn;
for (int i = 0; i < H/2-1; i++) {
newline_embedding = ggml_concat(ctx0, newline_embedding, model.sub_gn, 2);
}
ggml_tensor * global_image_features_hd_newline = ggml_concat(ctx0, global_image_features_hd, newline_embedding, 1);
global_image_features_hd_newline = ggml_cont(ctx0, ggml_permute(ctx0, global_image_features_hd_newline, 3, 2, 1, 0));
global_image_features_hd_newline = ggml_reshape_4d(ctx0, global_image_features_hd_newline, 1, 1, (w_crop*(H/2)+1) * h_crop*(H/2), 4*C);
global_image_features_hd_newline = ggml_cont(ctx0, ggml_permute(ctx0, global_image_features_hd_newline, 3, 2, 1, 0));
// sub-image pass: crop counts become image_size / 336 (integer division)
// NOTE(review): 336 is hard-coded; presumably the per-crop input resolution — confirm
h_crop = image_size / 336;
w_crop = image_size / 336;
// sub_image_features_hd
// restart from the untouched `embeddings`; the statements below repeat the global-pass
// chain verbatim, only h_crop / w_crop now carry the new values
x = embeddings;
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
// Phi3ImageEmbedding.reshape_hd_patches_2x2merge()
x = ggml_reshape_4d(ctx0, x, N, H, H, C);
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 1, 3));
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 0, 1, 2));
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 2, 3, 1, 0));
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
x = ggml_reshape_4d(ctx0, x, 2, H / 2, 2, H / 2 * C * N);
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 1, 3));
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 1, 3, 2));
x = ggml_reshape_3d(ctx0, x, N * C * (H / 2), (H / 2), 4);
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
x = ggml_reshape_4d(ctx0, x, 4, H / 2, H / 2, N * C);
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
x = ggml_reshape_4d(ctx0, x, 4, (H / 2) * (H / 2), C, N);
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 3, 1, 2));
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
x = ggml_reshape_4d(ctx0, x, 4 * C, H / 2, H / 2, N);
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
x = ggml_reshape_4d(ctx0, x, (H / 2) * 4 * C, (H / 2), w_crop, num_images * h_crop);
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 1, 3));
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
x = ggml_reshape_4d(ctx0, x, 4 * C, w_crop * (H / 2), h_crop * (H / 2), num_images);
x = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
ggml_tensor * sub_image_features_hd = ggml_cont(ctx0, ggml_permute(ctx0, x, 3, 2, 1, 0));
// Phi3ImageEmbedding.add_image_newline()
// rebuilt exactly as in the global pass: H/2 copies of sub_gn
newline_embedding = model.sub_gn;
for (int i = 0; i < (H/2-1); i++) {
newline_embedding = ggml_concat(ctx0, newline_embedding, model.sub_gn, 2);
}
ggml_tensor * sub_image_features_hd_newline = ggml_concat(ctx0, sub_image_features_hd, newline_embedding, 1);
sub_image_features_hd_newline = ggml_cont(ctx0, ggml_permute(ctx0, sub_image_features_hd_newline, 3, 2, 1, 0));
sub_image_features_hd_newline = ggml_reshape_4d(ctx0, sub_image_features_hd_newline, 1, 1, (w_crop*(H/2)+1) * h_crop*(H/2), 4*C);
sub_image_features_hd_newline = ggml_cont(ctx0, ggml_permute(ctx0, sub_image_features_hd_newline, 3, 2, 1, 0));
// final order: sub-image features, then glb_gn, then global features
embeddings = ggml_concat(ctx0, sub_image_features_hd_newline, model.glb_gn, 1);
embeddings = ggml_concat(ctx0, embeddings, global_image_features_hd_newline, 1);
}
// llava projector
if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
@ -1402,6 +1539,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
// LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
} catch (std::runtime_error & /*e*/) { }
try {
vision_model.sub_gn = get_tensor(new_clip->ctx_data, TN_SUB_GN);
vision_model.glb_gn = get_tensor(new_clip->ctx_data, TN_GLB_GN);
} catch (std::runtime_error & /*e*/) { }
} else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projection
vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));

View File

@ -16076,19 +16076,21 @@ static int llama_decode_internal(
return -1;
}
for (uint32_t i = 0; i < n_tokens_all; ++i) {
if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
return -1;
}
}
const auto & model = lctx.model;
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;
GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
if (batch_all.token) {
for (uint32_t i = 0; i < n_tokens_all; ++i) {
if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
return -1;
}
}
}
GGML_ASSERT(n_tokens_all <= cparams.n_batch);
GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
@ -16375,19 +16377,21 @@ static int llama_encode_internal(
return -1;
}
for (uint32_t i = 0; i < n_tokens; ++i) {
if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
return -1;
}
}
const auto & model = lctx.model;
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
if (batch.token) {
for (uint32_t i = 0; i < n_tokens; ++i) {
if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
return -1;
}
}
}
// micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");