mirror of https://github.com/ggerganov/llama.cpp.git, synced 2024-12-27 20:04:35 +00:00

multimodal support enabled by default

This commit is contained in:
parent 6c277eaab5
commit ed0c11cb83
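The diff below touches only the server example (examples/server/server.cpp): it drops the SERVER_MULTIMODAL_SUPPORT compile-time switch and all of its #ifdef/#else/#endif guards, so the multimodal (LLaVA) code path, including the clip.h and stb_image.h includes, the base64 image decoding, the per-slot image bookkeeping and the --mmproj argument, is now always compiled in. Whether multimodal mode is actually active at runtime is decided only by whether a projector file is supplied with --mmproj.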
@@ -3,12 +3,8 @@
 #include "build-info.h"
 #include "grammar-parser.h"
 
-//#define SERVER_MULTIMODAL_SUPPORT
-
-#ifdef SERVER_MULTIMODAL_SUPPORT
 #include "../llava/clip.h"
 #include "stb_image.h"
-#endif
 
 #ifndef NDEBUG
 // crash the server in debug mode, otherwise send an http 500 error
@@ -63,7 +59,6 @@ static bool server_verbose = false;
 #define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO(MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
 
-#ifdef SERVER_MULTIMODAL_SUPPORT
 static const std::string base64_chars =
     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
     "abcdefghijklmnopqrstuvwxyz"
@@ -112,7 +107,6 @@ std::vector<uint8_t> base64_decode(std::string const& encoded_string) {
 
     return ret;
 }
-#endif
 
 // parallel
 enum slot_state
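The base64_chars table from the previous hunk and the base64_decode() signature in the hunk header above are what the server uses to turn the base64-encoded image payload of an HTTP request back into raw bytes before it reaches the CLIP encoder; the decoder body itself is not part of this diff. The following is a minimal, self-contained sketch of a decoder built on the same alphabet table, deliberately named base64_decode_sketch so it is not mistaken for the verbatim server.cpp implementation.

// Sketch of a base64 decoder over the same alphabet as base64_chars above.
// Assumptions: '=' is padding and line breaks may be ignored; this is an
// illustration of the technique, not the server.cpp helper itself.
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

static const std::string base64_chars =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789+/";

static std::vector<uint8_t> base64_decode_sketch(const std::string &encoded) {
    std::vector<uint8_t> out;
    unsigned int val = 0;  // rolling bit buffer
    int bits = -8;         // turns non-negative once a full byte is buffered
    for (unsigned char c : encoded) {
        if (c == '=' || c == '\r' || c == '\n') continue;    // padding / line breaks
        size_t idx = base64_chars.find((char) c);
        if (idx == std::string::npos) break;                 // stop on invalid input
        val = (val << 6) | (unsigned int) idx;               // append 6 more bits
        bits += 6;
        if (bits >= 0) {
            out.push_back((uint8_t) ((val >> bits) & 0xFF)); // emit completed byte
            bits -= 8;
        }
    }
    return out;
}

int main() {
    // "aGVsbG8=" decodes to "hello"
    for (uint8_t b : base64_decode_sketch("aGVsbG8=")) {
        putchar(b);
    }
    putchar('\n');
    return 0;
}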
@@ -267,7 +261,6 @@ static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> & probs)
     return out;
 }
 
-#ifdef SERVER_MULTIMODAL_SUPPORT
 struct slot_image {
     clip_image_u8 img_data;
     bool request_encode_image = false;
@@ -276,7 +269,6 @@ struct slot_image {
     int id;
     std::string prefix_prompt = ""; // before of this image
 };
-#endif
 
 struct llama_client_slot
 {
@@ -322,9 +314,8 @@ struct llama_client_slot
     grammar_parser::parse_state parsed_grammar;
     llama_grammar *grammar = nullptr;
 
-#ifdef SERVER_MULTIMODAL_SUPPORT
+    // multimodal
     std::vector<slot_image> images;
-#endif
 
     void reset() {
         num_prompt_tokens = 0;
@@ -347,15 +338,12 @@ struct llama_client_slot
             ctx_sampling.grammar = NULL;
         }
 
-#ifdef SERVER_MULTIMODAL_SUPPORT
         for(slot_image img : images) {
             free(img.image_embedding);
             delete[] img.img_data.data;
             img.prefix_prompt = "";
         }
         images.clear();
-#endif
-
 
         // llama_set_rng_seed(ctx, params.seed); in batched the seed matter???????
     }
@@ -452,11 +440,9 @@ struct llama_server_context
     std::string user_name = ""; // this should be the anti prompt
     std::string assistant_name = ""; // this is for generate the prompt
 
-#ifdef SERVER_MULTIMODAL_SUPPORT
     bool multimodal = false;
     clip_ctx *clp_ctx = nullptr;
     int n_embd;
-#endif
 
     llama_model *model = nullptr;
     llama_context *ctx = nullptr;
@@ -490,7 +476,6 @@ struct llama_server_context
     bool loadModel(const gpt_params &params_)
     {
         params = params_;
-#ifdef SERVER_MULTIMODAL_SUPPORT
         if(!params.mmproj.empty()) {
             multimodal = true;
             LOG_TEE("Multi Modal Mode Enabled");
@@ -504,7 +489,6 @@ struct llama_server_context
                 params.n_ctx = 2048;
             }
         }
-#endif
         std::tie(model, ctx) = llama_init_from_gpt_params(params);
         if (model == nullptr)
         {
@@ -512,7 +496,6 @@ struct llama_server_context
             return false;
         }
 
-#ifdef SERVER_MULTIMODAL_SUPPORT
         if(multimodal) {
             int n_img_embd = clip_n_mmproj_embd(clp_ctx);
             n_embd = llama_n_embd(model);
@@ -523,7 +506,6 @@ struct llama_server_context
                 return false;
             }
         }
-#endif
         n_ctx = llama_n_ctx(ctx);
         n_vocab = llama_n_vocab(model);
         candidates.reserve(n_vocab);
@@ -829,7 +811,6 @@ struct llama_server_context
         return slot.has_next_token; // continue
     }
 
-#ifdef SERVER_MULTIMODAL_SUPPORT
     bool processImages(llama_client_slot &slot) {
         for(slot_image &img : slot.images) {
             if(!img.request_encode_image) {
@@ -914,7 +895,6 @@ struct llama_server_context
         }
         return true;
     }
-#endif
 
     bool updateSlots() {
         // update the system prompt wait until all slots are idle state
@@ -1088,7 +1068,6 @@ struct llama_server_context
                     {"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())},
                 });
 
-#ifdef SERVER_MULTIMODAL_SUPPORT
                 bool ingest_images = processImages(slot); // has images?
 
                 // process the prefix of first image
@@ -1105,15 +1084,7 @@ struct llama_server_context
                     LOG_TEE("failed processing images\n");
                     return false;
                 }
-#else
-                for (; slot.n_past < prompt_tokens.size(); ++slot.n_past) {
-                    batch.token [batch.n_tokens] = prompt_tokens[slot.n_past];
-                    batch.pos   [batch.n_tokens] = slot.n_past + num_tokens_system;
-                    batch.seq_id[batch.n_tokens] = slot.id;
-                    batch.logits[batch.n_tokens] = false;
-                    batch.n_tokens += 1;
-                }
-#endif
+
                 // extract the logits only for the last token
                 if (batch.n_tokens > 0) {
                     batch.logits[batch.n_tokens - 1] = true;
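With the guard gone there is no longer a separate text-only build of this loop: the #else branch that copied prompt tokens straight into the batch is deleted, and the image-aware path above it (processImages / ingest_images plus the per-image prefix handling) becomes the only prompt-ingestion route, covering prompts without images as well.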
@@ -1277,9 +1248,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("  -cb, --cont-batching      enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
     printf("  -spf FNAME, --system-prompt-file FNAME\n");
     printf("                            Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
-#ifdef SERVER_MULTIMODAL_SUPPORT
     printf("  --mmproj MMPROJ_FILE      path to a multimodal projector file for LLaVA.\n");
-#endif
     printf("\n");
 }
 
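Because the usage text is no longer conditionally compiled, --mmproj is always listed. A server built from this revision is started with something like: ./server -m <model.gguf> --mmproj <mmproj.gguf> (the file names are placeholders; only the flags come from the code above), and multimodal mode switches on automatically once a projector file is given.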
@@ -1570,7 +1539,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             );
             llama.processSystemPromptData(json::parse(systm_content));
         }
-#ifdef SERVER_MULTIMODAL_SUPPORT
         else if(arg == "--mmproj") {
             if (++i >= argc)
             {
@@ -1579,7 +1547,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             }
             params.mmproj = argv[i];
         }
-#endif
         else
         {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@@ -1697,11 +1664,7 @@ static json format_partial_response(
         {"content", content },
         {"stop", false},
         { "slot_id", slot->id },
-#ifdef SERVER_MULTIMODAL_SUPPORT
         {"multimodal", llama.multimodal }
-#else
-        {"multimodal", false }
-#endif
     };
 
     if (slot->sparams.n_probs > 0)
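Every streamed partial response therefore always reports the real llama.multimodal value instead of a hard-coded false in non-multimodal builds; a chunk looks roughly like {"content": "...", "stop": false, "slot_id": 0, "multimodal": true} (values illustrative, keys taken from the object above), so a client can detect image support from the stream itself.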
@@ -1810,8 +1773,8 @@ static void parse_options_completion(const json &body, llama_client_slot* slot,
             }
         }
     }
 
     LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama, slot));
-#ifdef SERVER_MULTIMODAL_SUPPORT
+
     if(!llama.multimodal) {
         return;
     }
@@ -1882,7 +1845,6 @@ static void parse_options_completion(const json &body, llama_client_slot* slot,
             slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
         }
     }
-#endif
 }
 
 static void parse_options_infill(const json &body, llama_server_context &llama, llama_client_slot *slot)