multimodal support enabled by default

FSSRepo 2023-10-17 16:58:20 -04:00
parent 6c277eaab5
commit ed0c11cb83


@@ -3,12 +3,8 @@
 #include "build-info.h"
 #include "grammar-parser.h"
-//#define SERVER_MULTIMODAL_SUPPORT
-#ifdef SERVER_MULTIMODAL_SUPPORT
 #include "../llava/clip.h"
 #include "stb_image.h"
-#endif
 #ifndef NDEBUG
 // crash the server in debug mode, otherwise send an http 500 error
@@ -63,7 +59,6 @@ static bool server_verbose = false;
 #define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO(MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
-#ifdef SERVER_MULTIMODAL_SUPPORT
 static const std::string base64_chars =
 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 "abcdefghijklmnopqrstuvwxyz"
@@ -112,7 +107,6 @@ std::vector<uint8_t> base64_decode(std::string const& encoded_string) {
 return ret;
 }
-#endif
 // parallel
 enum slot_state
@@ -267,7 +261,6 @@ static json probs_vector_to_json(const llama_context *ctx, const std::vector<com
 return out;
 }
-#ifdef SERVER_MULTIMODAL_SUPPORT
 struct slot_image {
 clip_image_u8 img_data;
 bool request_encode_image = false;
@@ -276,7 +269,6 @@ struct slot_image {
 int id;
 std::string prefix_prompt = ""; // before of this image
 };
-#endif
 struct llama_client_slot
 {
@@ -322,9 +314,8 @@ struct llama_client_slot
 grammar_parser::parse_state parsed_grammar;
 llama_grammar *grammar = nullptr;
-#ifdef SERVER_MULTIMODAL_SUPPORT
+// multimodal
 std::vector<slot_image> images;
-#endif
 void reset() {
 num_prompt_tokens = 0;
@@ -347,15 +338,12 @@ struct llama_client_slot
 ctx_sampling.grammar = NULL;
 }
-#ifdef SERVER_MULTIMODAL_SUPPORT
 for(slot_image img : images) {
 free(img.image_embedding);
 delete[] img.img_data.data;
 img.prefix_prompt = "";
 }
 images.clear();
-#endif
 // llama_set_rng_seed(ctx, params.seed); in batched the seed matter???????
 }
@@ -452,11 +440,9 @@ struct llama_server_context
 std::string user_name = ""; // this should be the anti prompt
 std::string assistant_name = ""; // this is for generate the prompt
-#ifdef SERVER_MULTIMODAL_SUPPORT
 bool multimodal = false;
 clip_ctx *clp_ctx = nullptr;
 int n_embd;
-#endif
 llama_model *model = nullptr;
 llama_context *ctx = nullptr;
@@ -490,7 +476,6 @@ struct llama_server_context
 bool loadModel(const gpt_params &params_)
 {
 params = params_;
-#ifdef SERVER_MULTIMODAL_SUPPORT
 if(!params.mmproj.empty()) {
 multimodal = true;
 LOG_TEE("Multi Modal Mode Enabled");
@@ -504,7 +489,6 @@ struct llama_server_context
 params.n_ctx = 2048;
 }
 }
-#endif
 std::tie(model, ctx) = llama_init_from_gpt_params(params);
 if (model == nullptr)
 {
@@ -512,7 +496,6 @@ struct llama_server_context
 return false;
 }
-#ifdef SERVER_MULTIMODAL_SUPPORT
 if(multimodal) {
 int n_img_embd = clip_n_mmproj_embd(clp_ctx);
 n_embd = llama_n_embd(model);
@@ -523,7 +506,6 @@ struct llama_server_context
 return false;
 }
 }
-#endif
 n_ctx = llama_n_ctx(ctx);
 n_vocab = llama_n_vocab(model);
 candidates.reserve(n_vocab);
@@ -829,7 +811,6 @@ struct llama_server_context
 return slot.has_next_token; // continue
 }
-#ifdef SERVER_MULTIMODAL_SUPPORT
 bool processImages(llama_client_slot &slot) {
 for(slot_image &img : slot.images) {
 if(!img.request_encode_image) {
@@ -914,7 +895,6 @@ struct llama_server_context
 }
 return true;
 }
-#endif
 bool updateSlots() {
 // update the system prompt wait until all slots are idle state
@@ -1088,7 +1068,6 @@ struct llama_server_context
 {"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())},
 });
-#ifdef SERVER_MULTIMODAL_SUPPORT
 bool ingest_images = processImages(slot); // has images?
 // process the prefix of first image
@@ -1105,15 +1084,7 @@ struct llama_server_context
 LOG_TEE("failed processing images\n");
 return false;
 }
-#else
-for (; slot.n_past < prompt_tokens.size(); ++slot.n_past) {
-batch.token [batch.n_tokens] = prompt_tokens[slot.n_past];
-batch.pos [batch.n_tokens] = slot.n_past + num_tokens_system;
-batch.seq_id[batch.n_tokens] = slot.id;
-batch.logits[batch.n_tokens] = false;
-batch.n_tokens += 1;
-}
-#endif
 // extract the logits only for the last token
 if (batch.n_tokens > 0) {
 batch.logits[batch.n_tokens - 1] = true;
@@ -1277,9 +1248,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
 printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
 printf(" -spf FNAME, --system-prompt-file FNAME\n");
 printf(" Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
-#ifdef SERVER_MULTIMODAL_SUPPORT
 printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
-#endif
 printf("\n");
 }
@@ -1570,7 +1539,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 );
 llama.processSystemPromptData(json::parse(systm_content));
 }
-#ifdef SERVER_MULTIMODAL_SUPPORT
 else if(arg == "--mmproj") {
 if (++i >= argc)
 {
@@ -1579,7 +1547,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 }
 params.mmproj = argv[i];
 }
-#endif
 else
 {
 fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@@ -1697,11 +1664,7 @@ static json format_partial_response(
 {"content", content },
 {"stop", false},
 { "slot_id", slot->id },
-#ifdef SERVER_MULTIMODAL_SUPPORT
 {"multimodal", llama.multimodal }
-#else
-{"multimodal", false }
-#endif
 };
 if (slot->sparams.n_probs > 0)
@@ -1810,8 +1773,8 @@ static void parse_options_completion(const json &body, llama_client_slot* slot,
 }
 }
 }
 LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama, slot));
-#ifdef SERVER_MULTIMODAL_SUPPORT
 if(!llama.multimodal) {
 return;
 }
@@ -1882,7 +1845,6 @@ static void parse_options_completion(const json &body, llama_client_slot* slot,
 slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
 }
 }
-#endif
 }
 static void parse_options_infill(const json &body, llama_server_context &llama, llama_client_slot *slot)
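
With the SERVER_MULTIMODAL_SUPPORT guards removed, the LLaVA/CLIP path is always compiled into the server; multimodal mode is switched on at runtime only when a projector is supplied (loadModel sets multimodal = true when params.mmproj is non-empty). A minimal launch sketch follows; the model and projector file names are hypothetical, while --mmproj comes from the usage text above and -m is the server's standard model flag:

    ./server -m llava-model-q4_0.gguf --mmproj mmproj-f16.gguf

When no --mmproj is given, the server runs as before, and streamed partial responses report the active mode through the "multimodal" field added in format_partial_response above.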