Mirror of https://github.com/ggerganov/llama.cpp.git, synced 2025-01-13 20:14:29 +00:00

Merge branch 'master' into server-rev

commit 176993c871

Makefile (9 changes)
@@ -1,7 +1,7 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
	main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
	simple batched batched-bench save-load-state server embd-input-test gguf llama-bench llava baby-llama beam-search \
	simple batched batched-bench save-load-state server gguf llama-bench llava baby-llama beam-search \
	speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o

# Binaries only useful for tests

@@ -608,13 +608,6 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)

$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)

embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput

gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@@ -962,7 +962,6 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /

- [main](./examples/main/README.md)
- [server](./examples/server/README.md)
- [embd-input](./examples/embd-input/README.md)
- [jeopardy](./examples/jeopardy/README.md)
- [BLIS](./docs/BLIS.md)
- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
@@ -107,7 +107,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
    std::string arg;
    gpt_params default_params;
    const std::string arg_prefix = "--";
    llama_sampling_params & sparams = params.sampling_params;
    llama_sampling_params & sparams = params.sparams;

    for (int i = 1; i < argc; i++) {
        arg = argv[i];
@@ -241,25 +241,26 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
            sparams.repeat_last_n = std::stoi(argv[i]);
            sparams.penalty_last_n = std::stoi(argv[i]);
            sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n);
        } else if (arg == "--repeat-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            sparams.repeat_penalty = std::stof(argv[i]);
            sparams.penalty_repeat = std::stof(argv[i]);
        } else if (arg == "--frequency-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            sparams.frequency_penalty = std::stof(argv[i]);
            sparams.penalty_freq = std::stof(argv[i]);
        } else if (arg == "--presence-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            sparams.presence_penalty = std::stof(argv[i]);
            sparams.penalty_present = std::stof(argv[i]);
        } else if (arg == "--mirostat") {
            if (++i >= argc) {
                invalid_param = true;
@@ -572,7 +573,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
            params.grammar = argv[i];
            sparams.grammar = argv[i];
        } else if (arg == "--grammar-file") {
            if (++i >= argc) {
                invalid_param = true;
@@ -587,7 +588,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            std::copy(
                std::istreambuf_iterator<char>(file),
                std::istreambuf_iterator<char>(),
                std::back_inserter(params.grammar)
                std::back_inserter(sparams.grammar)
            );
#ifndef LOG_DISABLE_LOGS
    // Parse args for logging parameters
@@ -640,7 +641,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
}

void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    const llama_sampling_params & sparams = params.sampling_params;
    const llama_sampling_params & sparams = params.sparams;

    printf("usage: %s [options]\n", argv[0]);
    printf("\n");
@@ -678,10 +679,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
    printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
    printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
    printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.repeat_last_n);
    printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.repeat_penalty);
    printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.presence_penalty);
    printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.frequency_penalty);
    printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
    printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
    printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
    printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
    printf(" --mirostat N use Mirostat sampling.\n");
    printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
    printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
@@ -878,7 +879,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
    }

    if (params.ignore_eos) {
        params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
        params.sparams.logit_bias[llama_token_eos(lctx)] = -INFINITY;
    }

    {
@@ -1123,28 +1124,28 @@ std::string get_sortable_timestamp() {

void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
                               const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
    const llama_sampling_params & sparams = params.sampling_params;
    const llama_sampling_params & sparams = params.sparams;

    fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
    fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
    fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
    fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
    fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
    fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
    fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
    fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
    fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
    fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
    fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
    fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
    fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
    fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
    fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
    fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
    fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
    fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
    fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
    fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
    fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
    fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
    fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
    fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
    fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
    fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
    fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
    fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
    fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
    fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
    fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
    fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
    fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
    fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
    fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
    fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");

#ifdef NDEBUG
    fprintf(stream, "debug: false\n");
@@ -1178,8 +1179,8 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
    fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
    fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty);
    dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
    dump_string_yaml_multiline(stream, "grammar", sparams.grammar.c_str());
    fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
    fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
    fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
@@ -1238,14 +1239,14 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
    fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
    fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
    fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.presence_penalty);
    fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
    dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
    fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
    fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
    fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
    dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
    fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.repeat_penalty);
    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);

    fprintf(stream, "reverse_prompt:\n");
    for (std::string ap : params.antiprompt) {
@@ -56,7 +56,7 @@ struct gpt_params {
    float rope_freq_scale = 0.0f; // RoPE frequency scaling factor

    // // sampling parameters
    struct llama_sampling_params sampling_params;
    struct llama_sampling_params sparams;

    std::string model = "models/7B/ggml-model-f16.gguf"; // model path
    std::string model_draft = ""; // draft model for speculative decoding
@@ -66,7 +66,6 @@ struct gpt_params {
    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
    std::string input_prefix = ""; // string to prefix user inputs with
    std::string input_suffix = ""; // string to suffix user inputs with
    std::string grammar = ""; // optional BNF-like grammar to constrain sampling
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
    std::string logdir = ""; // directory in which to save YAML log files
@@ -399,7 +399,7 @@ namespace grammar_parser {
    void print_grammar(FILE * file, const parse_state & state) {
        try {
            std::map<uint32_t, std::string> symbol_id_names;
            for (auto kv : state.symbol_ids) {
            for (const auto & kv : state.symbol_ids) {
                symbol_id_names[kv.second] = kv.first;
            }
            for (size_t i = 0, end = state.rules.size(); i < end; i++) {
@@ -1,9 +1,9 @@
#include "sampling.h"

struct llama_sampling_context * llama_sampling_init(const struct gpt_params & params) {
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
    struct llama_sampling_context * result = new llama_sampling_context();

    result->params = params.sampling_params;
    result->params = params;
    result->grammar = nullptr;

    // if there is a grammar, parse it
@@ -23,7 +23,7 @@ struct llama_sampling_context * llama_sampling_init(const struct gpt_params & pa
            grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
    }

    result->prev.resize(params.n_ctx);
    result->prev.resize(params.n_prev);

    return result;
}
@@ -66,25 +66,56 @@ void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * ds
    dst->prev = src->prev;
}

llama_token llama_sampling_last(llama_sampling_context * ctx) {
    return ctx->prev.back();
}

std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) {
    const int size = ctx_sampling->prev.size();

    n = std::min(n, size);

    std::string result;

    for (int i = size - n; i < size; i++) {
        result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]);
    }

    return result;
}

std::string llama_sampling_print(const llama_sampling_params & params) {
    char result[1024];

    snprintf(result, sizeof(result),
            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, typical_p = %.3f, temp = %.3f\n"
            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
            params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
            params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp,
            params.mirostat, params.mirostat_eta, params.mirostat_tau);

    return std::string(result);
}

llama_token llama_sampling_sample(
        struct llama_sampling_context * ctx_sampling,
        struct llama_context * ctx_main,
        struct llama_context * ctx_cfg,
        const int idx) {
    const int n_ctx = llama_n_ctx(ctx_main);
    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));

    const llama_sampling_params & params = ctx_sampling->params;

    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));

    const float temp = params.temp;
    const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
    const float top_p = params.top_p;
    const float tfs_z = params.tfs_z;
    const float typical_p = params.typical_p;
    const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
    const float repeat_penalty = params.repeat_penalty;
    const float alpha_presence = params.presence_penalty;
    const float alpha_frequency = params.frequency_penalty;
    const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
    const float penalty_repeat = params.penalty_repeat;
    const float penalty_freq = params.penalty_freq;
    const float penalty_present = params.penalty_present;
    const int mirostat = params.mirostat;
    const float mirostat_tau = params.mirostat_tau;
    const float mirostat_eta = params.mirostat_eta;
@@ -97,7 +128,7 @@ llama_token llama_sampling_sample(

    float * logits = llama_get_logits_ith(ctx_main, idx);

    // Apply params.logit_bias map
    // apply params.logit_bias map
    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
        logits[it->first] += it->second;
    }
@@ -117,14 +148,10 @@ llama_token llama_sampling_sample(
    // apply penalties
    if (!prev.empty()) {
        const float nl_logit = logits[llama_token_nl(ctx_main)];
        const int last_n_repeat = std::min(std::min((int)prev.size(), repeat_last_n), n_ctx);

        llama_sample_repetition_penalty(ctx_main, &cur_p,
                prev.data() + prev.size() - last_n_repeat,
                last_n_repeat, repeat_penalty);
        llama_sample_frequency_and_presence_penalties(ctx_main, &cur_p,
                prev.data() + prev.size() - last_n_repeat,
                last_n_repeat, alpha_frequency, alpha_presence);
        llama_sample_repetition_penalties(ctx_main, &cur_p,
                prev.data() + prev.size() - penalty_last_n,
                penalty_last_n, penalty_repeat, penalty_freq, penalty_present);

        if (!penalize_nl) {
            for (size_t idx = 0; idx < cur_p.size; idx++) {
@@ -141,7 +168,7 @@ llama_token llama_sampling_sample(
    }

    if (temp <= 0) {
        // Greedy sampling
        // greedy sampling
        id = llama_sample_token_greedy(ctx_main, &cur_p);
    } else {
        if (mirostat == 1) {
@@ -152,8 +179,9 @@ llama_token llama_sampling_sample(
            llama_sample_temp(ctx_main, &cur_p, temp);
            id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
        } else {
            // Temperature sampling
            // temperature sampling
            size_t min_keep = std::max(1, params.n_probs);

            llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep);
            llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep);
            llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep);
@@ -183,11 +211,12 @@ llama_token llama_sampling_sample(
void llama_sampling_accept(
        struct llama_sampling_context * ctx_sampling,
        struct llama_context * ctx_main,
        llama_token id) {
        llama_token id,
        bool apply_grammar) {
    ctx_sampling->prev.erase(ctx_sampling->prev.begin());
    ctx_sampling->prev.push_back(id);

    if (ctx_sampling->grammar != NULL) {
    if (ctx_sampling->grammar != NULL && apply_grammar) {
        llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
    }
}
@@ -10,30 +10,30 @@

// sampling parameters
typedef struct llama_sampling_params {
    int32_t n_prev = 64; // number of previous tokens to remember
    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
    int32_t top_k = 40; // <= 0 to use vocab size
    float top_p = 0.95f; // 1.0 = disabled
    float tfs_z = 1.00f; // 1.0 = disabled
    float typical_p = 1.00f; // 1.0 = disabled
    float temp = 0.80f; // 1.0 = disabled
    float repeat_penalty = 1.10f; // 1.0 = disabled
    int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
    float frequency_penalty = 0.00f; // 0.0 = disabled
    float presence_penalty = 0.00f; // 0.0 = disabled
    int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
    float penalty_repeat = 1.10f; // 1.0 = disabled
    float penalty_freq = 0.00f; // 0.0 = disabled
    float penalty_present = 0.00f; // 0.0 = disabled
    int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    float mirostat_tau = 5.00f; // target entropy
    float mirostat_eta = 0.10f; // learning rate

    bool penalize_nl = true; // consider newlines as a repeatable token

    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
    std::string grammar; // optional BNF-like grammar to constrain sampling

    // Classifier-Free Guidance
    // https://arxiv.org/abs/2306.17806
    std::string cfg_negative_prompt; // string to help guidance
    float cfg_scale = 1.f; // How strong is guidance
    std::string cfg_negative_prompt; // string to help guidance
    float cfg_scale = 1.f; // how strong is guidance

    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens

} llama_sampling_params;

// general sampler context
@@ -58,7 +58,7 @@ struct llama_sampling_context {
#include "common.h"

// Create a new sampling context instance.
struct llama_sampling_context * llama_sampling_init(const struct gpt_params & params);
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);

void llama_sampling_free(struct llama_sampling_context * ctx);

@@ -70,6 +70,15 @@ void llama_sampling_reset(llama_sampling_context * ctx);
// Copy the sampler context
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);

// Get the last sampled token
llama_token llama_sampling_last(llama_sampling_context * ctx);

// Get a string representation of the last sampled tokens
std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);

// Print sampling parameters into a string
std::string llama_sampling_print(const llama_sampling_params & params);

// this is a common sampling function used across the examples for convenience
// it can serve as a starting point for implementing your own sampling function
// Note: When using multiple sequences, it is the caller's responsibility to call
@@ -96,4 +105,5 @@ llama_token llama_sampling_sample(
void llama_sampling_accept(
        struct llama_sampling_context * ctx_sampling,
        struct llama_context * ctx_main,
        llama_token id);
        llama_token id,
        bool apply_grammar);
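Taken together, the reworked sampling API above is meant to be driven from the examples roughly as in the following minimal sketch. This is not part of the diff itself: it assumes a populated `gpt_params params`, a loaded `llama_context * ctx`, and an already-evaluated batch whose last logits live at index `idx`.

```
#include "common.h"
#include "sampling.h"

// Minimal usage sketch of the updated common/sampling interface (assumed
// surrounding setup: `params`, `ctx`, and `idx` come from the caller).
static void sample_a_few_tokens(gpt_params & params, llama_context * ctx, int idx) {
    llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);

    for (int i = 0; i < 8; ++i) {
        // pick the next token using the renamed penalty fields
        // (penalty_last_n, penalty_repeat, penalty_freq, penalty_present)
        const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr, idx);

        // record the token; `true` also advances the grammar state, mirroring
        // the new `apply_grammar` flag on llama_sampling_accept()
        llama_sampling_accept(ctx_sampling, ctx, id, true);

        if (id == llama_token_eos(ctx)) {
            break;
        }
        // ... feed `id` back through llama_decode() before sampling again ...
    }

    llama_sampling_free(ctx_sampling);
}
```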
@@ -1425,7 +1425,7 @@ void train_opt_callback(void * vdata, int accum_step, float * sched, bool * canc

    int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f);
    if (impr_plot > 0) impr_plot = 0;
    if (std::isnan(opt->loss_before) || std::isnan(opt->loss_before)) impr_plot = 0;
    if (std::isnan(opt->loss_before) || std::isnan(opt->loss_after)) impr_plot = 0;
    printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f",
        __func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count,
        *sched, opt->loss_after);
@@ -76,6 +76,7 @@ def parse_args() -> argparse.Namespace:
        "ftype", type=int, choices=[0, 1], default=1, nargs='?',
        help="output format - use 0 for float32, 1 for float16",
    )
    parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
    return parser.parse_args()

args = parse_args()
@@ -86,6 +87,11 @@ if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)

endianess = gguf.GGUFEndian.LITTLE
if args.bigendian:
    endianess = gguf.GGUFEndian.BIG
endianess_str = "Big Endian" if args.bigendian else "Little Endian"
print(f"gguf: Conversion Endianess {endianess}")
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
@@ -113,7 +119,7 @@ if hparams["architectures"][0] != "BaichuanForCausalLM":
num_parts = count_model_parts(dir_model)
print(f"num_parts:{num_parts}\n")
ARCH=gguf.MODEL_ARCH.BAICHUAN
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

print("gguf: get model metadata")
@@ -78,7 +78,7 @@ print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

if hparams["architectures"][0] != "FalconForCausalLM":
if hparams["architectures"][0] not in ("RWForCausalLM", "FalconForCausalLM"):
    print("Model architecture not supported: " + hparams["architectures"][0])

    sys.exit(1)
@@ -97,7 +97,17 @@ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

block_count = hparams["num_hidden_layers"]
block_count = hparams.get("num_hidden_layers")
if block_count is None:
    block_count = hparams["n_layer"] # old name

n_head = hparams.get("num_attention_heads")
if n_head is None:
    n_head = hparams["n_head"] # old name

n_head_kv = hparams.get("num_kv_heads")
if n_head_kv is None:
    n_head_kv = hparams.get("n_head_kv", 1) # old name

gguf_writer.add_name("Falcon")
gguf_writer.add_context_length(2048) # not in config.json
@@ -105,11 +115,8 @@ gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["num_attention_heads"])
if "num_kv_heads" in hparams:
    gguf_writer.add_head_count_kv(hparams["num_kv_heads"])
else:
    gguf_writer.add_head_count_kv(1)
gguf_writer.add_head_count(n_head)
gguf_writer.add_head_count_kv(n_head_kv)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)

@@ -152,10 +159,6 @@ special_vocab.add_to_gguf(gguf_writer)

tensor_map = gguf.get_tensor_name_map(ARCH,block_count)

# params for qkv transform
n_head = hparams["num_attention_heads"]
n_head_kv = hparams["num_kv_heads"] if "num_kv_heads" in hparams else 1

head_dim = hparams["hidden_size"] // n_head

# tensor info
convert.py (20 changes)
@@ -803,8 +803,8 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:


class OutputFile:
    def __init__(self, fname_out: Path) -> None:
        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

    def add_meta_arch(self, params: Params) -> None:
        name = "LLaMA"
@@ -875,10 +875,10 @@ class OutputFile:
        self.gguf.close()

    @staticmethod
    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None:
    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
        check_vocab_size(params, vocab)

        of = OutputFile(fname_out)
        of = OutputFile(fname_out, endianess=endianess)

        # meta data
        of.add_meta_arch(params)
@@ -903,10 +903,10 @@ class OutputFile:
        return dt.quantize(arr)

    @staticmethod
    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
        check_vocab_size(params, vocab)

        of = OutputFile(fname_out)
        of = OutputFile(fname_out, endianess=endianess)

        # meta data
        of.add_meta_arch(params)
@@ -1123,8 +1123,9 @@ def main(args_in: list[str] | None = None) -> None:
    parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
    parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
    parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
    args = parser.parse_args(args_in)
    parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")

    args = parser.parse_args(args_in)
    if args.dump_single:
        model_plus = lazy_load_file(args.model)
        do_dump_model(model_plus)
@@ -1138,6 +1139,9 @@ def main(args_in: list[str] | None = None) -> None:
    if args.dump:
        do_dump_model(model_plus)
        return
    endianess = gguf.GGUFEndian.LITTLE
    if args.bigendian:
        endianess = gguf.GGUFEndian.BIG

    params = Params.load(model_plus)
    if params.n_ctx == -1:
@@ -1185,7 +1189,7 @@ def main(args_in: list[str] | None = None) -> None:
    params.ftype = ftype
    print(f"Writing {outfile}, format {ftype}")

    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency)
    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
    print(f"Wrote {outfile}")
@@ -12,26 +12,26 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})

if (EMSCRIPTEN)
else()
    add_subdirectory(main)
    add_subdirectory(quantize)
    add_subdirectory(quantize-stats)
    add_subdirectory(perplexity)
    add_subdirectory(embedding)
    add_subdirectory(save-load-state)
    add_subdirectory(benchmark)
    add_subdirectory(baby-llama)
    add_subdirectory(train-text-from-scratch)
    add_subdirectory(finetune)
    add_subdirectory(convert-llama2c-to-ggml)
    add_subdirectory(simple)
    add_subdirectory(batched)
    add_subdirectory(batched-bench)
    add_subdirectory(speculative)
    add_subdirectory(parallel)
    add_subdirectory(embd-input)
    add_subdirectory(llava)
    add_subdirectory(llama-bench)
    add_subdirectory(beam-search)
    add_subdirectory(benchmark)
    add_subdirectory(convert-llama2c-to-ggml)
    add_subdirectory(embedding)
    add_subdirectory(finetune)
    add_subdirectory(infill)
    add_subdirectory(llama-bench)
    add_subdirectory(llava)
    add_subdirectory(main)
    add_subdirectory(parallel)
    add_subdirectory(perplexity)
    add_subdirectory(quantize)
    add_subdirectory(quantize-stats)
    add_subdirectory(save-load-state)
    add_subdirectory(simple)
    add_subdirectory(speculative)
    add_subdirectory(train-text-from-scratch)
    if (LLAMA_METAL)
        add_subdirectory(metal)
    endif()
@@ -11,12 +11,16 @@ int main(int argc, char ** argv) {
    gpt_params params;

    if (argc == 1 || argv[1][0] == '-') {
        printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL]\n" , argv[0]);
        printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN]\n" , argv[0]);
        return 1 ;
    }

    // number of parallel batches
    int n_parallel = 1;

    // total length of the sequences including the prompt
    int n_len = 32;

    if (argc >= 2) {
        params.model = argv[1];
    }
@@ -29,13 +33,14 @@ int main(int argc, char ** argv) {
        n_parallel = std::atoi(argv[3]);
    }

    if (argc >= 5) {
        n_len = std::atoi(argv[4]);
    }

    if (params.prompt.empty()) {
        params.prompt = "Hello my name is";
    }

    // total length of the sequences including the prompt
    const int n_len = 32;

    // init LLM

    llama_backend_init(params.numa);
@@ -536,7 +536,7 @@ static bool is_ggml_file(const char * filename) {
    if (file.size < 4) {
        return false;
    }
    uint32_t magic = file.read_u32();
    std::string magic = file.read_string(4);
    return magic == GGUF_MAGIC;
}
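The hunk above switches the magic check from a little-endian `uint32_t` read to a 4-character string compare. As a standalone illustration only (the real code goes through the example's own `llama_file` helper, and `GGUF_MAGIC` is assumed here to be the ASCII string "GGUF"), the check amounts to:

```
#include <cstdio>
#include <string>

// Sketch: detect a GGUF file by its 4-byte magic, read as text instead of as a
// packed integer, which keeps the comparison independent of host endianness.
static bool looks_like_gguf(const char * filename) {
    std::FILE * f = std::fopen(filename, "rb");
    if (f == nullptr) {
        return false;
    }
    char magic[4] = {0};
    const bool ok = std::fread(magic, 1, sizeof(magic), f) == sizeof(magic);
    std::fclose(f);
    return ok && std::string(magic, sizeof(magic)) == "GGUF";
}
```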
examples/embd-input/.gitignore (vendored, 4 changes)
@@ -1,4 +0,0 @@
PandaGPT
MiniGPT-4
*.pth
@@ -1,17 +0,0 @@
set(TARGET embdinput)
add_library(${TARGET} embd-input-lib.cpp embd-input.h)
install(TARGETS ${TARGET} LIBRARY)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
    add_dependencies(${TARGET} BUILD_INFO)
endif()

set(TARGET embd-input-test)
add_executable(${TARGET} embd-input-test.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
    add_dependencies(${TARGET} BUILD_INFO)
endif()
@@ -1,63 +0,0 @@
### Examples for input embedding directly

## Requirement

Build `libembdinput.so` by running the following command in the main dir (../../):

```
make
```

## [LLaVA](https://github.com/haotian-liu/LLaVA/) example (llava.py)

1. Obtain the LLaVA model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/).
2. Convert it to ggml format.
3. `llava_projection.pth` is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin).

```
import torch

bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
pth_path = "./examples/embd-input/llava_projection.pth"

dic = torch.load(bin_path)
used_key = ["model.mm_projector.weight","model.mm_projector.bias"]
torch.save({k: dic[k] for k in used_key}, pth_path)
```
4. Check the path of the LLaVA model and `llava_projection.pth` in `llava.py`.


## [PandaGPT](https://github.com/yxuansu/PandaGPT) example (panda_gpt.py)

1. Obtain the PandaGPT LoRA model from https://github.com/yxuansu/PandaGPT. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format.
The `adapter_config.json` is:
```
{
  "peft_type": "LORA",
  "fan_in_fan_out": false,
  "bias": null,
  "modules_to_save": null,
  "r": 32,
  "lora_alpha": 32,
  "lora_dropout": 0.1,
  "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"]
}
```
2. Prepare the `vicuna` v0 model.
3. Obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model.
4. Clone the PandaGPT source.
```
git clone https://github.com/yxuansu/PandaGPT
```
5. Install the requirements of PandaGPT.
6. Check the paths of the PandaGPT source, the ImageBind model, the LoRA model, and the vicuna model in panda_gpt.py.

## [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4/) example (minigpt4.py)

1. Obtain the MiniGPT-4 model from https://github.com/Vision-CAIR/MiniGPT-4/ and put it in `embd-input`.
2. Clone the MiniGPT-4 source.
```
git clone https://github.com/Vision-CAIR/MiniGPT-4/
```
3. Install the requirements of PandaGPT.
4. Prepare the `vicuna` v0 model.
5. Check the paths of the MiniGPT-4 source, the MiniGPT-4 model, and the vicuna model in `minigpt4.py`.
@ -1,221 +0,0 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "embd-input.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
static llama_context ** g_ctx;
|
||||
|
||||
extern "C" {
|
||||
|
||||
struct MyModel* create_mymodel(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
print_build_info();
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = uint32_t(time(NULL));
|
||||
}
|
||||
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
|
||||
|
||||
llama_backend_init(params.numa);
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
|
||||
g_ctx = &ctx;
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
||||
}
|
||||
struct MyModel * ret = new MyModel();
|
||||
ret->ctx = ctx;
|
||||
ret->params = params;
|
||||
ret->n_past = 0;
|
||||
// printf("ctx: %d\n", ret->ctx);
|
||||
return ret;
|
||||
}
|
||||
|
||||
void free_mymodel(struct MyModel * mymodel) {
|
||||
llama_context * ctx = mymodel->ctx;
|
||||
llama_print_timings(ctx);
|
||||
llama_free(ctx);
|
||||
delete mymodel;
|
||||
}
|
||||
|
||||
|
||||
bool eval_float(void * model, float * input, int N){
|
||||
MyModel * mymodel = (MyModel*)model;
|
||||
llama_context * ctx = mymodel->ctx;
|
||||
gpt_params params = mymodel->params;
|
||||
int n_emb = llama_n_embd(llama_get_model(ctx));
|
||||
int n_past = mymodel->n_past;
|
||||
int n_batch = N; // params.n_batch;
|
||||
|
||||
for (int i = 0; i < (int) N; i += n_batch) {
|
||||
int n_eval = (int) N - i;
|
||||
if (n_eval > n_batch) {
|
||||
n_eval = n_batch;
|
||||
}
|
||||
llama_batch batch = { int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
|
||||
if (llama_decode(ctx, batch)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
n_past += n_eval;
|
||||
}
|
||||
mymodel->n_past = n_past;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool eval_tokens(void * model, std::vector<llama_token> tokens) {
|
||||
MyModel * mymodel = (MyModel* )model;
|
||||
llama_context * ctx;
|
||||
ctx = mymodel->ctx;
|
||||
gpt_params params = mymodel->params;
|
||||
int n_past = mymodel->n_past;
|
||||
for (int i = 0; i < (int) tokens.size(); i += params.n_batch) {
|
||||
int n_eval = (int) tokens.size() - i;
|
||||
if (n_eval > params.n_batch) {
|
||||
n_eval = params.n_batch;
|
||||
}
|
||||
if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
n_past += n_eval;
|
||||
}
|
||||
mymodel->n_past = n_past;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool eval_id(struct MyModel* mymodel, int id) {
|
||||
std::vector<llama_token> tokens;
|
||||
tokens.push_back(id);
|
||||
return eval_tokens(mymodel, tokens);
|
||||
}
|
||||
|
||||
bool eval_string(struct MyModel * mymodel,const char* str){
|
||||
llama_context * ctx = mymodel->ctx;
|
||||
std::string str2 = str;
|
||||
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx, str2, true);
|
||||
eval_tokens(mymodel, embd_inp);
|
||||
return true;
|
||||
}
|
||||
|
||||
llama_token sampling_id(struct MyModel* mymodel) {
|
||||
llama_context* ctx = mymodel->ctx;
|
||||
gpt_params params = mymodel->params;
|
||||
llama_sampling_params & sparams = params.sampling_params;
|
||||
// int n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
// out of user input, sample next token
|
||||
const float temp = sparams.temp;
|
||||
const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : sparams.top_k;
|
||||
const float top_p = sparams.top_p;
|
||||
const float tfs_z = sparams.tfs_z;
|
||||
const float typical_p = sparams.typical_p;
|
||||
// const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
||||
// const float repeat_penalty = params.repeat_penalty;
|
||||
// const float alpha_presence = params.presence_penalty;
|
||||
// const float alpha_frequency = params.frequency_penalty;
|
||||
const int mirostat = sparams.mirostat;
|
||||
const float mirostat_tau = sparams.mirostat_tau;
|
||||
const float mirostat_eta = sparams.mirostat_eta;
|
||||
// const bool penalize_nl = params.penalize_nl;
|
||||
|
||||
llama_token id = 0;
|
||||
{
|
||||
auto logits = llama_get_logits(ctx);
|
||||
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||
|
||||
// Apply params.logit_bias map
|
||||
for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
|
||||
logits[it->first] += it->second;
|
||||
}
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
// TODO: Apply penalties
|
||||
// float nl_logit = logits[llama_token_nl(ctx)];
|
||||
// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
|
||||
// llama_sample_repetition_penalty(ctx, &candidates_p,
|
||||
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
// last_n_repeat, repeat_penalty);
|
||||
// llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
|
||||
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
// last_n_repeat, alpha_frequency, alpha_presence);
|
||||
// if (!penalize_nl) {
|
||||
// logits[llama_token_nl(ctx)] = nl_logit;
|
||||
// }
|
||||
|
||||
if (temp <= 0) {
|
||||
// Greedy sampling
|
||||
id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
} else {
|
||||
if (mirostat == 1) {
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
const int mirostat_m = 100;
|
||||
llama_sample_temp(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
|
||||
} else if (mirostat == 2) {
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
llama_sample_temp(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
|
||||
} else {
|
||||
// Temperature sampling
|
||||
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
|
||||
llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
|
||||
llama_sample_typical(ctx, &candidates_p, typical_p, 1);
|
||||
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
|
||||
llama_sample_temp(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token(ctx, &candidates_p);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return id;
|
||||
}
|
||||
|
||||
const char * sampling(struct MyModel * mymodel) {
|
||||
llama_context * ctx = mymodel->ctx;
|
||||
int id = sampling_id(mymodel);
|
||||
static std::string ret;
|
||||
if (id == llama_token_eos(ctx)) {
|
||||
ret = "</s>";
|
||||
} else {
|
||||
ret = llama_token_to_piece(ctx, id);
|
||||
}
|
||||
eval_id(mymodel, id);
|
||||
return ret.c_str();
|
||||
}
|
||||
|
||||
}
|
@ -1,35 +0,0 @@
|
||||
#include "embd-input.h"
|
||||
#include <stdlib.h>
|
||||
#include <random>
|
||||
#include <string.h>
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
||||
auto mymodel = create_mymodel(argc, argv);
|
||||
int N = 10;
|
||||
int max_tgt_len = 500;
|
||||
int n_embd = llama_n_embd(llama_get_model(mymodel->ctx));
|
||||
|
||||
// add random float embd to test evaluation
|
||||
float * data = new float[N*n_embd];
|
||||
std::default_random_engine e;
|
||||
std::uniform_real_distribution<float> u(0,1);
|
||||
for (int i=0;i<N*n_embd;i++) {
|
||||
data[i] = u(e);
|
||||
}
|
||||
|
||||
eval_string(mymodel, "user: what is the color of the flag of UN?");
|
||||
eval_float(mymodel, data, N);
|
||||
eval_string(mymodel, "assistant:");
|
||||
eval_string(mymodel, mymodel->params.prompt.c_str());
|
||||
const char* tmp;
|
||||
for (int i=0; i<max_tgt_len; i++) {
|
||||
tmp = sampling(mymodel);
|
||||
if (strcmp(tmp, "</s>")==0) break;
|
||||
printf("%s", tmp);
|
||||
fflush(stdout);
|
||||
}
|
||||
printf("\n");
|
||||
free_mymodel(mymodel);
|
||||
return 0;
|
||||
}
|
@ -1,27 +0,0 @@
|
||||
#ifndef _EMBD_INPUT_H_
|
||||
#define _EMBD_INPUT_H_ 1
|
||||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
extern "C" {
|
||||
|
||||
typedef struct MyModel {
|
||||
llama_context* ctx;
|
||||
gpt_params params;
|
||||
int n_past = 0;
|
||||
} MyModel;
|
||||
|
||||
struct MyModel* create_mymodel(int argc, char ** argv);
|
||||
|
||||
bool eval_float(void* model, float* input, int N);
|
||||
bool eval_tokens(void* model, std::vector<llama_token> tokens);
|
||||
bool eval_id(struct MyModel* mymodel, int id);
|
||||
bool eval_string(struct MyModel* mymodel, const char* str);
|
||||
const char * sampling(struct MyModel* mymodel);
|
||||
llama_token sampling_id(struct MyModel* mymodel);
|
||||
void free_mymodel(struct MyModel* mymodel);
|
||||
|
||||
}
|
||||
|
||||
#endif
|
@ -1,72 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import ctypes
|
||||
from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int
|
||||
import numpy as np
|
||||
import os
|
||||
|
||||
libc = cdll.LoadLibrary("./libembdinput.so")
|
||||
libc.sampling.restype=c_char_p
|
||||
libc.create_mymodel.restype=c_void_p
|
||||
libc.eval_string.argtypes=[c_void_p, c_char_p]
|
||||
libc.sampling.argtypes=[c_void_p]
|
||||
libc.eval_float.argtypes=[c_void_p, POINTER(c_float), c_int]
|
||||
|
||||
|
||||
class MyModel:
|
||||
def __init__(self, args):
|
||||
argc = len(args)
|
||||
c_str = [c_char_p(i.encode()) for i in args]
|
||||
args_c = (c_char_p * argc)(*c_str)
|
||||
self.model = c_void_p(libc.create_mymodel(argc, args_c))
|
||||
self.max_tgt_len = 512
|
||||
self.print_string_eval = True
|
||||
|
||||
def __del__(self):
|
||||
libc.free_mymodel(self.model)
|
||||
|
||||
def eval_float(self, x):
|
||||
libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[1])
|
||||
|
||||
def eval_string(self, x):
|
||||
libc.eval_string(self.model, x.encode()) # c_char_p(x.encode()))
|
||||
if self.print_string_eval:
|
||||
print(x)
|
||||
|
||||
def eval_token(self, x):
|
||||
libc.eval_id(self.model, x)
|
||||
|
||||
def sampling(self):
|
||||
s = libc.sampling(self.model)
|
||||
return s
|
||||
|
||||
def stream_generate(self, end="</s>"):
|
||||
ret = b""
|
||||
end = end.encode()
|
||||
for _ in range(self.max_tgt_len):
|
||||
tmp = self.sampling()
|
||||
ret += tmp
|
||||
yield tmp
|
||||
if ret.endswith(end):
|
||||
break
|
||||
|
||||
def generate_with_print(self, end="</s>"):
|
||||
ret = b""
|
||||
for i in self.stream_generate(end=end):
|
||||
ret += i
|
||||
print(i.decode(errors="replace"), end="", flush=True)
|
||||
print("")
|
||||
return ret.decode(errors="replace")
|
||||
|
||||
|
||||
def generate(self, end="</s>"):
|
||||
text = b"".join(self.stream_generate(end=end))
|
||||
return text.decode(errors="replace")
|
||||
|
||||
if __name__ == "__main__":
|
||||
model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"])
|
||||
model.eval_string("""user: what is the color of the flag of UN?""")
|
||||
x = np.random.random((5120,10))# , dtype=np.float32)
|
||||
model.eval_float(x)
|
||||
model.eval_string("""assistant:""")
|
||||
for i in model.generate():
|
||||
print(i.decode(errors="replace"), end="", flush=True)
|
@ -1,71 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
from embd_input import MyModel
|
||||
import numpy as np
|
||||
from torch import nn
|
||||
import torch
|
||||
from transformers import CLIPVisionModel, CLIPImageProcessor
|
||||
from PIL import Image
|
||||
|
||||
# model parameters from 'liuhaotian/LLaVA-13b-delta-v1-1'
|
||||
vision_tower = "openai/clip-vit-large-patch14"
|
||||
select_hidden_state_layer = -2
|
||||
# (vision_config.image_size // vision_config.patch_size) ** 2
|
||||
image_token_len = (224//14)**2
|
||||
|
||||
class Llava:
|
||||
def __init__(self, args):
|
||||
self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower)
|
||||
self.vision_tower = CLIPVisionModel.from_pretrained(vision_tower)
|
||||
self.mm_projector = nn.Linear(1024, 5120)
|
||||
self.model = MyModel(["main", *args])
|
||||
|
||||
def load_projection(self, path):
|
||||
state = torch.load(path)
|
||||
self.mm_projector.load_state_dict({
|
||||
"weight": state["model.mm_projector.weight"],
|
||||
"bias": state["model.mm_projector.bias"]})
|
||||
|
||||
def chat(self, question):
|
||||
self.model.eval_string("user: ")
|
||||
self.model.eval_string(question)
|
||||
self.model.eval_string("\nassistant: ")
|
||||
return self.model.generate_with_print()
|
||||
|
||||
def chat_with_image(self, image, question):
|
||||
with torch.no_grad():
|
||||
embd_image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
|
||||
image_forward_out = self.vision_tower(embd_image.unsqueeze(0), output_hidden_states=True)
|
||||
select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer]
|
||||
image_feature = select_hidden_state[:, 1:]
|
||||
embd_image = self.mm_projector(image_feature)
|
||||
embd_image = embd_image.cpu().numpy()[0]
|
||||
self.model.eval_string("user: ")
|
||||
self.model.eval_token(32003-2) # im_start
|
||||
self.model.eval_float(embd_image.T)
|
||||
for i in range(image_token_len-embd_image.shape[0]):
|
||||
self.model.eval_token(32003-3) # im_patch
|
||||
self.model.eval_token(32003-1) # im_end
|
||||
self.model.eval_string(question)
|
||||
self.model.eval_string("\nassistant: ")
|
||||
return self.model.generate_with_print()
|
||||
|
||||
|
||||
if __name__=="__main__":
|
||||
# model form liuhaotian/LLaVA-13b-delta-v1-1
|
||||
a = Llava(["--model", "./models/ggml-llava-13b-v1.1.bin", "-c", "2048"])
|
||||
# Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin.
|
||||
# Also here can use pytorch_model-00003-of-00003.bin directly.
|
||||
a.load_projection(os.path.join(
|
||||
os.path.dirname(__file__) ,
|
||||
"llava_projection.pth"))
|
||||
respose = a.chat_with_image(
|
||||
Image.open("./media/llama1-logo.png").convert('RGB'),
|
||||
"what is the text in the picture?")
|
||||
respose
|
||||
a.chat("what is the color of it?")
|
||||
|
||||
|
||||
|
@ -1,129 +0,0 @@
#!/usr/bin/env python3
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from embd_input import MyModel
import numpy as np
from torch import nn
import torch
from PIL import Image

minigpt4_path = os.path.join(os.path.dirname(__file__), "MiniGPT-4")
sys.path.insert(0, minigpt4_path)
from minigpt4.models.blip2 import Blip2Base
from minigpt4.processors.blip_processors import Blip2ImageEvalProcessor


class MiniGPT4(Blip2Base):
"""
MiniGPT4 model from https://github.com/Vision-CAIR/MiniGPT-4
"""
def __init__(self,
args,
vit_model="eva_clip_g",
q_former_model="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth",
img_size=224,
drop_path_rate=0,
use_grad_checkpoint=False,
vit_precision="fp32",
freeze_vit=True,
freeze_qformer=True,
num_query_token=32,
llama_model="",
prompt_path="",
prompt_template="",
max_txt_len=32,
end_sym='\n',
low_resource=False,  # use 8 bit and put vit in cpu
device_8bit=0
):
super().__init__()
self.img_size = img_size
self.low_resource = low_resource
self.preprocessor = Blip2ImageEvalProcessor(img_size)

print('Loading VIT')
self.visual_encoder, self.ln_vision = self.init_vision_encoder(
vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision
)
print('Loading VIT Done')
print('Loading Q-Former')
self.Qformer, self.query_tokens = self.init_Qformer(
num_query_token, self.visual_encoder.num_features
)
self.Qformer.cls = None
self.Qformer.bert.embeddings.word_embeddings = None
self.Qformer.bert.embeddings.position_embeddings = None
for layer in self.Qformer.bert.encoder.layer:
layer.output = None
layer.intermediate = None
self.load_from_pretrained(url_or_filename=q_former_model)
print('Loading Q-Former Done')
self.llama_proj = nn.Linear(
self.Qformer.config.hidden_size, 5120 # self.llama_model.config.hidden_size
)
self.max_txt_len = max_txt_len
self.end_sym = end_sym
self.model = MyModel(["main", *args])
# system prompt
self.model.eval_string("Give the following image: <Img>ImageContent</Img>. "
"You will be able to see the image once I provide it to you. Please answer my questions."
"###")

def encode_img(self, image):
image = self.preprocessor(image)
image = image.unsqueeze(0)
device = image.device
if self.low_resource:
self.vit_to_cpu()
image = image.to("cpu")

with self.maybe_autocast():
image_embeds = self.ln_vision(self.visual_encoder(image)).to(device)
image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device)

query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
query_output = self.Qformer.bert(
query_embeds=query_tokens,
encoder_hidden_states=image_embeds,
encoder_attention_mask=image_atts,
return_dict=True,
)

inputs_llama = self.llama_proj(query_output.last_hidden_state)
# atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(image.device)
return inputs_llama

def load_projection(self, path):
state = torch.load(path)["model"]
self.llama_proj.load_state_dict({
"weight": state["llama_proj.weight"],
"bias": state["llama_proj.bias"]})

def chat(self, question):
self.model.eval_string("Human: ")
self.model.eval_string(question)
self.model.eval_string("\n### Assistant:")
return self.model.generate_with_print(end="###")

def chat_with_image(self, image, question):
with torch.no_grad():
embd_image = self.encode_img(image)
embd_image = embd_image.cpu().numpy()[0]
self.model.eval_string("Human: <Img>")
self.model.eval_float(embd_image.T)
self.model.eval_string("</Img> ")
self.model.eval_string(question)
self.model.eval_string("\n### Assistant:")
return self.model.generate_with_print(end="###")


if __name__=="__main__":
a = MiniGPT4(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048"])
a.load_projection(os.path.join(
os.path.dirname(__file__) ,
"pretrained_minigpt4.pth"))
respose = a.chat_with_image(
Image.open("./media/llama1-logo.png").convert('RGB'),
"what is the text in the picture?")
a.chat("what is the color of it?")
@ -1,99 +0,0 @@
#!/usr/bin/env python3
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from embd_input import MyModel
import numpy as np
from torch import nn
import torch

# use PandaGPT path
panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT")
imagebind_ckpt_path = "./models/panda_gpt/"

sys.path.insert(0, os.path.join(panda_gpt_path,"code","model"))
from ImageBind.models import imagebind_model
from ImageBind import data

ModalityType = imagebind_model.ModalityType
max_tgt_len = 400

class PandaGPT:
def __init__(self, args):
self.visual_encoder,_ = imagebind_model.imagebind_huge(pretrained=True, store_path=imagebind_ckpt_path)
self.visual_encoder.eval()
self.llama_proj = nn.Linear(1024, 5120) # self.visual_hidden_size, 5120)
self.max_tgt_len = max_tgt_len
self.model = MyModel(["main", *args])
self.generated_text = ""
self.device = "cpu"

def load_projection(self, path):
state = torch.load(path, map_location="cpu")
self.llama_proj.load_state_dict({
"weight": state["llama_proj.weight"],
"bias": state["llama_proj.bias"]})

def eval_inputs(self, inputs):
self.model.eval_string("<Img>")
embds = self.extract_multimoal_feature(inputs)
for i in embds:
self.model.eval_float(i.T)
self.model.eval_string("</Img> ")

def chat(self, question):
return self.chat_with_image(None, question)

def chat_with_image(self, inputs, question):
if self.generated_text == "":
self.model.eval_string("###")
self.model.eval_string(" Human: ")
if inputs:
self.eval_inputs(inputs)
self.model.eval_string(question)
self.model.eval_string("\n### Assistant:")
ret = self.model.generate_with_print(end="###")
self.generated_text += ret
return ret

def extract_multimoal_feature(self, inputs):
features = []
for key in ["image", "audio", "video", "thermal"]:
if key + "_paths" in inputs:
embeds = self.encode_data(key, inputs[key+"_paths"])
features.append(embeds)
return features

def encode_data(self, data_type, data_paths):

type_map = {
"image": ModalityType.VISION,
"audio": ModalityType.AUDIO,
"video": ModalityType.VISION,
"thermal": ModalityType.THERMAL,
}
load_map = {
"image": data.load_and_transform_vision_data,
"audio": data.load_and_transform_audio_data,
"video": data.load_and_transform_video_data,
"thermal": data.load_and_transform_thermal_data
}

load_function = load_map[data_type]
key = type_map[data_type]

inputs = {key: load_function(data_paths, self.device)}
with torch.no_grad():
embeddings = self.visual_encoder(inputs)
embeds = embeddings[key]
embeds = self.llama_proj(embeds).cpu().numpy()
return embeds


if __name__=="__main__":
a = PandaGPT(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048", "--lora", "./models/panda_gpt/ggml-adapter-model.bin","--temp", "0"])
a.load_projection("./models/panda_gpt/adapter_model.bin")
a.chat_with_image(
{"image_paths": ["./media/llama1-logo.png"]},
"what is the text in the picture? 'llama' or 'lambda'?")
a.chat("what is the color of it?")
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -4,5 +4,5 @@ install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()
@ -39,8 +39,8 @@ static gpt_params * g_params;
static std::vector<llama_token> * g_input_tokens;
static std::ostringstream * g_output_ss;
static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false;

static bool is_interacting = false;

static void write_logfile(
const llama_context * ctx, const gpt_params & params, const llama_model * model,
@ -104,7 +104,7 @@ static void sigint_handler(int signo) {

int main(int argc, char ** argv) {
gpt_params params;
llama_sampling_params & sparams = params.sampling_params;
llama_sampling_params & sparams = params.sparams;
g_params = &params;

if (!gpt_params_parse(argc, argv, params)) {
@ -358,36 +358,10 @@ int main(int argc, char ** argv) {
|
||||
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
}
|
||||
}
|
||||
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
|
||||
sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
|
||||
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
|
||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
LOG_TEE("\n\n");
|
||||
|
||||
struct llama_grammar * grammar = NULL;
|
||||
grammar_parser::parse_state parsed_grammar;
|
||||
|
||||
if (!params.grammar.empty()) {
|
||||
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
|
||||
// will be empty (default) if there are parse errors
|
||||
if (parsed_grammar.rules.empty()) {
|
||||
return 1;
|
||||
}
|
||||
LOG_TEE("%s: grammar:\n", __func__);
|
||||
grammar_parser::print_grammar(stderr, parsed_grammar);
|
||||
LOG_TEE("\n");
|
||||
|
||||
{
|
||||
auto it = sparams.logit_bias.find(llama_token_eos(ctx));
|
||||
if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
|
||||
LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
|
||||
grammar = llama_grammar_init(
|
||||
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
||||
}
|
||||
|
||||
LOG_TEE("\n##### Infill mode #####\n\n");
|
||||
if (params.infill) {
|
||||
printf("\n************\n");
|
||||
@ -430,7 +404,7 @@ int main(int argc, char ** argv) {
|
||||
std::vector<llama_token> embd;
|
||||
std::vector<llama_token> embd_guidance;
|
||||
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
|
||||
|
||||
while (n_remain != 0 || params.interactive) {
|
||||
// predict
|
||||
@ -549,7 +523,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx, id);
|
||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
||||
|
||||
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
|
||||
|
||||
@ -567,8 +541,11 @@ int main(int argc, char ** argv) {
|
||||
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
||||
while ((int) embd_inp.size() > n_consumed) {
|
||||
embd.push_back(embd_inp[n_consumed]);
|
||||
ctx_sampling->prev.erase(ctx_sampling->prev.begin());
|
||||
ctx_sampling->prev.push_back(embd_inp[n_consumed]);
|
||||
|
||||
// push the prompt in the sampling context in order to apply repetition penalties later
|
||||
// for the prompt, we don't apply grammar rules
|
||||
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
|
||||
|
||||
++n_consumed;
|
||||
if ((int) embd.size() >= params.n_batch) {
|
||||
break;
|
||||
@ -600,7 +577,7 @@ int main(int argc, char ** argv) {
|
||||
if ((int) embd_inp.size() <= n_consumed) {
|
||||
|
||||
// deal with eot token in infill mode
|
||||
if ((ctx_sampling->prev.back() == llama_token_eot(ctx) || is_interacting) && params.interactive){
|
||||
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(ctx) || is_interacting) && params.interactive){
|
||||
if(is_interacting && !params.interactive_first) {
|
||||
// print an eot token
|
||||
printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
|
||||
@ -617,7 +594,7 @@ int main(int argc, char ** argv) {
|
||||
buffer += line;
|
||||
} while (another_line);
|
||||
// check if we got an empty line, if so we use the old input
|
||||
if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
|
||||
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
|
||||
params.input_prefix = buffer;
|
||||
}
|
||||
buffer.clear();
|
||||
@ -627,7 +604,7 @@ int main(int argc, char ** argv) {
|
||||
buffer += line;
|
||||
} while (another_line);
|
||||
// check if we got an empty line
|
||||
if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
|
||||
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
|
||||
params.input_suffix = buffer;
|
||||
}
|
||||
buffer.clear();
|
||||
@ -640,7 +617,7 @@ int main(int argc, char ** argv) {
|
||||
process_escapes(params.input_suffix);
|
||||
}
|
||||
suff_rm_leading_spc = params.escape;
|
||||
if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
|
||||
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
|
||||
params.input_suffix.erase(0, 1);
|
||||
suff_rm_leading_spc = false;
|
||||
}
|
||||
@ -667,7 +644,7 @@ int main(int argc, char ** argv) {
|
||||
is_interacting = false;
|
||||
}
|
||||
// deal with end of text token in interactive mode
|
||||
else if (ctx_sampling->prev.back() == llama_token_eos(ctx)) {
|
||||
else if (llama_sampling_last(ctx_sampling) == llama_token_eos(ctx)) {
|
||||
LOG("found EOS token\n");
|
||||
|
||||
if (params.interactive) {
|
||||
@ -740,15 +717,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
if (n_past > 0) {
|
||||
if (is_interacting) {
|
||||
// reset grammar state if we're restarting generation
|
||||
if (grammar != NULL) {
|
||||
llama_grammar_free(grammar);
|
||||
|
||||
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
|
||||
grammar = llama_grammar_init(
|
||||
grammar_rules.data(), grammar_rules.size(),
|
||||
parsed_grammar.symbol_ids.at("root"));
|
||||
}
|
||||
llama_sampling_reset(ctx_sampling);
|
||||
}
|
||||
is_interacting = false;
|
||||
}
|
||||
@ -778,9 +747,7 @@ int main(int argc, char ** argv) {
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
if (grammar != NULL) {
|
||||
llama_grammar_free(grammar);
|
||||
}
|
||||
llama_sampling_free(ctx_sampling);
|
||||
llama_backend_free();
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
|
@ -112,8 +112,7 @@ static float get_f32(const gguf_context * ctx, const std::string & key) {
|
||||
static struct ggml_tensor * get_tensor(struct ggml_context * ctx, const std::string & name) {
|
||||
struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
|
||||
if (!cur) {
|
||||
printf("unable to find tensor %s\n", name.c_str());
|
||||
throw std::runtime_error(format("unable to find tensor %s\n", name.c_str()));
|
||||
throw std::runtime_error(format("%s: unable to find tensor %s\n", __func__, name.c_str()));
|
||||
}
|
||||
|
||||
return cur;
|
||||
@ -136,7 +135,7 @@ static std::string get_ftype(int ftype) {
|
||||
case 8:
|
||||
return "q8_0";
|
||||
default:
|
||||
throw std::runtime_error(format("Unrecognized file type: %d\n", ftype));
|
||||
throw std::runtime_error(format("%s: Unrecognized file type: %d\n", __func__, ftype));
|
||||
}
|
||||
}
|
||||
|
||||
@ -462,6 +461,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
};
|
||||
|
||||
struct gguf_context * ctx = gguf_init_from_file(fname, params);
|
||||
if (!ctx) {
|
||||
throw std::runtime_error(format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname));
|
||||
}
|
||||
|
||||
if (verbosity >= 1) {
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
|
@ -16,13 +16,29 @@ checkpoint = torch.load(path)
|
||||
mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_projector")]
|
||||
|
||||
# store these tensors in a new dictionary and torch.save them
|
||||
projector = {name: checkpoint[name] for name in mm_tensors}
|
||||
projector = {name: checkpoint[name].float() for name in mm_tensors}
|
||||
torch.save(projector, f"{args.model}/llava.projector")
|
||||
|
||||
# remove these tensors from the checkpoint and save it again
|
||||
for name in mm_tensors:
|
||||
del checkpoint[name]
|
||||
|
||||
# BakLLaVA models contain CLIP tensors in it
|
||||
clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")]
|
||||
if len(clip_tensors) > 0:
|
||||
clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors}
|
||||
torch.save(clip, f"{args.model}/llava.clip")
|
||||
|
||||
# remove these tensors
|
||||
for name in clip_tensors:
|
||||
del checkpoint[name]
|
||||
|
||||
# added tokens should be removed to be able to convert Mistral models
|
||||
if os.path.exists(f"{args.model}/added_tokens.json"):
|
||||
with open(f"{args.model}/added_tokens.json", "w") as f:
|
||||
f.write("{}\n")
|
||||
|
||||
|
||||
torch.save(checkpoint, path)
|
||||
|
||||
print("Done!")
|
||||
|
@ -58,28 +58,30 @@ inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n
|
||||
|
||||
// TODO: use common/sampling.h
|
||||
inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
|
||||
// out of user input, sample next token
|
||||
const float temp = params.sampling_params.temp;
|
||||
const int32_t top_k = params.sampling_params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : params.sampling_params.top_k;
|
||||
const float top_p = params.sampling_params.top_p;
|
||||
const float tfs_z = params.sampling_params.tfs_z;
|
||||
const float typical_p = params.sampling_params.typical_p;
|
||||
// const int32_t repeat_last_n = params.sampling_params.repeat_last_n < 0 ? n_ctx : params.sampling_params.repeat_last_n;
|
||||
// const float repeat_penalty = params.sampling_params.repeat_penalty;
|
||||
// const float alpha_presence = params.sampling_params.presence_penalty;
|
||||
// const float alpha_frequency = params.sampling_params.frequency_penalty;
|
||||
const int mirostat = params.sampling_params.mirostat;
|
||||
const float mirostat_tau = params.sampling_params.mirostat_tau;
|
||||
const float mirostat_eta = params.sampling_params.mirostat_eta;
|
||||
// const bool penalize_nl = params.sampling_params.penalize_nl;
|
||||
auto & sparams = params.sparams;
|
||||
|
||||
// out of user input, sample next token
|
||||
const float temp = sparams.temp;
|
||||
const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k;
|
||||
const float top_p = sparams.top_p;
|
||||
const float tfs_z = sparams.tfs_z;
|
||||
const float typical_p = sparams.typical_p;
|
||||
// const int32_t repeat_last_n = sparams.repeat_last_n < 0 ? n_ctx : sparams.repeat_last_n;
|
||||
// const float repeat_penalty = sparams.repeat_penalty;
|
||||
// const float alpha_presence = sparams.presence_penalty;
|
||||
// const float alpha_frequency = sparams.frequency_penalty;
|
||||
const int mirostat = sparams.mirostat;
|
||||
const float mirostat_tau = sparams.mirostat_tau;
|
||||
const float mirostat_eta = sparams.mirostat_eta;
|
||||
// const bool penalize_nl = sparams.penalize_nl;
|
||||
|
||||
llama_token id = 0;
|
||||
{
|
||||
auto logits = llama_get_logits(ctx_llama);
|
||||
auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));
|
||||
|
||||
// Apply params.logit_bias map
|
||||
for (auto it = params.sampling_params.logit_bias.begin(); it != params.sampling_params.logit_bias.end(); it++) {
|
||||
// Apply params.logit_bias map
|
||||
for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
|
||||
logits[it->first] += it->second;
|
||||
}
|
||||
|
||||
@ -91,18 +93,18 @@ inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
// TODO: Apply penalties
|
||||
// float nl_logit = logits[llama_token_nl(ctx)];
|
||||
// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
|
||||
// llama_sample_repetition_penalty(ctx, &candidates_p,
|
||||
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
// last_n_repeat, repeat_penalty);
|
||||
// llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
|
||||
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
// last_n_repeat, alpha_frequency, alpha_presence);
|
||||
// if (!penalize_nl) {
|
||||
// logits[llama_token_nl(ctx)] = nl_logit;
|
||||
// }
|
||||
// TODO: Apply penalties
|
||||
// float nl_logit = logits[llama_token_nl(ctx)];
|
||||
// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
|
||||
// llama_sample_repetition_penalty(ctx, &candidates_p,
|
||||
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
// last_n_repeat, repeat_penalty);
|
||||
// llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
|
||||
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
// last_n_repeat, alpha_frequency, alpha_presence);
|
||||
// if (!penalize_nl) {
|
||||
// logits[llama_token_nl(ctx)] = nl_logit;
|
||||
// }
|
||||
|
||||
if (temp <= 0) {
|
||||
// Greedy sampling
|
||||
|
@ -108,7 +108,7 @@ int main(int argc, char ** argv) {
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
return 1;
|
||||
}
|
||||
llama_sampling_params & sparams = params.sampling_params;
|
||||
llama_sampling_params & sparams = params.sparams;
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("main", "log"));
|
||||
@ -415,8 +415,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
}
|
||||
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
|
||||
sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
|
||||
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
|
||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
LOG_TEE("\n\n");
|
||||
|
||||
@ -459,7 +458,7 @@ int main(int argc, char ** argv) {
|
||||
std::vector<llama_token> embd;
|
||||
std::vector<llama_token> embd_guidance;
|
||||
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
|
||||
|
||||
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
|
||||
// predict
|
||||
@ -612,7 +611,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx, id);
|
||||
llama_sampling_accept(ctx_sampling, ctx, id, true);
|
||||
|
||||
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
|
||||
|
||||
@ -631,12 +630,9 @@ int main(int argc, char ** argv) {
|
||||
while ((int) embd_inp.size() > n_consumed) {
|
||||
embd.push_back(embd_inp[n_consumed]);
|
||||
|
||||
// GG: I'm not sure it's a good idea to push the prompt tokens into the sampling context
|
||||
// Most likely will remove this in the future to avoid exposing "prev"
|
||||
// Same thing is done in "server". If we stop pushing the prompt tokens, then the repetition
|
||||
// penalty will be applied only based on the tokens generated by the model.
|
||||
ctx_sampling->prev.erase(ctx_sampling->prev.begin());
|
||||
ctx_sampling->prev.push_back(embd_inp[n_consumed]);
|
||||
// push the prompt in the sampling context in order to apply repetition penalties later
|
||||
// for the prompt, we don't apply grammar rules
|
||||
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
|
||||
|
||||
++n_consumed;
|
||||
if ((int) embd.size() >= params.n_batch) {
|
||||
@ -667,12 +663,10 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// if not currently processing queued inputs;
|
||||
if ((int) embd_inp.size() <= n_consumed) {
|
||||
// check for reverse prompt
|
||||
// check for reverse prompt in the last n_prev tokens
|
||||
if (!params.antiprompt.empty()) {
|
||||
std::string last_output;
|
||||
for (auto id : ctx_sampling->prev) {
|
||||
last_output += llama_token_to_piece(ctx, id);
|
||||
}
|
||||
const int n_prev = 32;
|
||||
const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev);
|
||||
|
||||
is_antiprompt = false;
|
||||
// Check if each of the reverse prompts appears at the end of the output.
|
||||
@ -699,7 +693,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// deal with end of text token in interactive mode
|
||||
if (ctx_sampling->prev.back() == llama_token_eos(ctx)) {
|
||||
if (llama_sampling_last(ctx_sampling) == llama_token_eos(ctx)) {
|
||||
LOG("found EOS token\n");
|
||||
|
||||
if (params.interactive) {
|
||||
|
@ -157,7 +157,7 @@ int main(int argc, char ** argv) {
|
||||
for (size_t i = 0; i < clients.size(); ++i) {
|
||||
auto & client = clients[i];
|
||||
client.id = i;
|
||||
client.ctx_sampling = llama_sampling_init(params);
|
||||
client.ctx_sampling = llama_sampling_init(params.sparams);
|
||||
}
|
||||
|
||||
std::vector<llama_token> tokens_system;
|
||||
@ -330,7 +330,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i);
|
||||
|
||||
llama_sampling_accept(client.ctx_sampling, ctx, id);
|
||||
llama_sampling_accept(client.ctx_sampling, ctx, id, true);
|
||||
|
||||
if (client.n_decoded == 1) {
|
||||
// start measuring generation time after the first token to make sure all concurrent clients
|
||||
|
@ -181,8 +181,6 @@ struct slot_params
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
|
||||
std::string grammar; // optional BNF-like grammar to constrain sampling
|
||||
|
||||
std::vector<std::string> antiprompt;
|
||||
|
||||
json input_prefix;
|
||||
@ -339,36 +337,6 @@ static T json_value(const json &body, const std::string &key, const T &default_v
|
||||
: default_value;
|
||||
}
|
||||
|
||||
// TODO: this is not needed, should reuse llama_sampling_init from common/sampling.h
|
||||
static struct llama_sampling_context * llama_sampling_init_srv(const struct llama_sampling_params &sparams, const std::string &grammar, int n_ctx)
|
||||
{
|
||||
struct llama_sampling_context * result = new llama_sampling_context();
|
||||
|
||||
result->params = sparams;
|
||||
result->grammar = nullptr;
|
||||
|
||||
// if there is a grammar, parse it
|
||||
if (!grammar.empty()) {
|
||||
result->parsed_grammar = grammar_parser::parse(grammar.c_str());
|
||||
|
||||
// will be empty (default) if there are parse errors
|
||||
if (result->parsed_grammar.rules.empty()) {
|
||||
fprintf(stderr, "%s: failed to parse grammar\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
|
||||
|
||||
result->grammar = llama_grammar_init(
|
||||
grammar_rules.data(),
|
||||
grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
|
||||
}
|
||||
|
||||
result->prev.resize(n_ctx);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
struct llama_client_slot
|
||||
{
|
||||
int id;
|
||||
@ -410,7 +378,6 @@ struct llama_client_slot
|
||||
struct llama_sampling_params sparams;
|
||||
llama_sampling_context* ctx_sampling = nullptr;
|
||||
bool has_next_token = true;
|
||||
int max_context_size = 0;
|
||||
|
||||
// multimodal
|
||||
std::vector<slot_image> images;
|
||||
@ -435,7 +402,7 @@ struct llama_client_slot
|
||||
llama_sampling_free(ctx_sampling);
|
||||
}
|
||||
|
||||
ctx_sampling = llama_sampling_init_srv(sparams, params.grammar, max_context_size);
|
||||
ctx_sampling = llama_sampling_init(sparams);
|
||||
|
||||
for (slot_image &img : images)
|
||||
{
|
||||
@ -455,7 +422,7 @@ struct llama_client_slot
|
||||
llama_sampling_free(ctx_sampling);
|
||||
}
|
||||
|
||||
ctx_sampling = llama_sampling_init_srv(sparams, params.grammar, max_context_size);
|
||||
ctx_sampling = llama_sampling_init(sparams);
|
||||
return ctx_sampling != nullptr;
|
||||
}
|
||||
|
||||
@ -629,7 +596,7 @@ struct llama_server_context
|
||||
{
|
||||
llama_client_slot slot;
|
||||
slot.id = i;
|
||||
slot.max_context_size = max_ctx_per_slot;
|
||||
slot.sparams.n_prev = max_ctx_per_slot;
|
||||
slot.reset();
|
||||
|
||||
LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, max_ctx_per_slot);
|
||||
@ -702,26 +669,26 @@ struct llama_server_context
|
||||
bool launch_slot_with_data(llama_client_slot* &slot, json data) {
|
||||
slot_params default_params;
|
||||
llama_sampling_params default_sparams;
|
||||
slot->params.stream = json_value(data, "stream", false);
|
||||
slot->params.cache_prompt = json_value(data, "cache_prompt", false);
|
||||
slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
|
||||
slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
|
||||
slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
|
||||
slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
|
||||
slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
|
||||
slot->sparams.repeat_last_n = json_value(data, "repeat_last_n", default_sparams.repeat_last_n);
|
||||
slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
|
||||
slot->sparams.repeat_penalty = json_value(data, "repeat_penalty", default_sparams.repeat_penalty);
|
||||
slot->sparams.presence_penalty = json_value(data, "presence_penalty", default_sparams.presence_penalty);
|
||||
slot->sparams.frequency_penalty = json_value(data, "frequency_penalty", default_sparams.frequency_penalty);
|
||||
slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
|
||||
slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
|
||||
slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
|
||||
slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
|
||||
slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
|
||||
slot->params.seed = json_value(data, "seed", default_params.seed);
|
||||
slot->params.grammar = json_value(data, "grammar", default_params.grammar);
|
||||
slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
|
||||
slot->params.stream = json_value(data, "stream", false);
|
||||
slot->params.cache_prompt = json_value(data, "cache_prompt", false);
|
||||
slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
|
||||
slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
|
||||
slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
|
||||
slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
|
||||
slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
|
||||
slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
|
||||
slot->sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
|
||||
slot->sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
|
||||
slot->sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
|
||||
slot->sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
|
||||
slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
|
||||
slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
|
||||
slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
|
||||
slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
|
||||
slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
|
||||
slot->params.seed = json_value(data, "seed", default_params.seed);
|
||||
slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
|
||||
slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
|
||||
|
||||
// infill
|
||||
if (data.count("input_prefix") != 0)
|
||||
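The json_value calls above define the per-request sampling fields the server reads. As an illustration only (the field names are taken from launch_slot_with_data above; the values and the use of a Python dict are assumptions, not part of this change), a completion request body could look like:

# Hypothetical request body; only keys read by launch_slot_with_data are used.
request = {
    "stream": True,
    "cache_prompt": False,
    "n_predict": 128,
    "temperature": 0.7,
    "top_k": 40,
    "top_p": 0.9,
    "repeat_last_n": 64,       # stored in sparams.penalty_last_n
    "repeat_penalty": 1.1,     # stored in sparams.penalty_repeat
    "frequency_penalty": 0.0,  # stored in sparams.penalty_freq
    "presence_penalty": 0.0,   # stored in sparams.penalty_present
    "mirostat": 0,
    "n_probs": 0,
    "grammar": "",
}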
@ -1157,10 +1124,10 @@ struct llama_server_context
|
||||
{"top_p", slot.sparams.top_p},
|
||||
{"tfs_z", slot.sparams.tfs_z},
|
||||
{"typical_p", slot.sparams.typical_p},
|
||||
{"repeat_last_n", slot.sparams.repeat_last_n},
|
||||
{"repeat_penalty", slot.sparams.repeat_penalty},
|
||||
{"presence_penalty", slot.sparams.presence_penalty},
|
||||
{"frequency_penalty", slot.sparams.frequency_penalty},
|
||||
{"repeat_last_n", slot.sparams.penalty_last_n},
|
||||
{"repeat_penalty", slot.sparams.penalty_repeat},
|
||||
{"presence_penalty", slot.sparams.penalty_present},
|
||||
{"frequency_penalty", slot.sparams.penalty_freq},
|
||||
{"mirostat", slot.sparams.mirostat},
|
||||
{"mirostat_tau", slot.sparams.mirostat_tau},
|
||||
{"mirostat_eta", slot.sparams.mirostat_eta},
|
||||
@ -1172,7 +1139,7 @@ struct llama_server_context
|
||||
{"stream", slot.params.stream},
|
||||
{"logit_bias", slot.sparams.logit_bias},
|
||||
{"n_probs", slot.sparams.n_probs},
|
||||
{"grammar", slot.params.grammar},
|
||||
{"grammar", slot.sparams.grammar},
|
||||
};
|
||||
}
|
||||
|
||||
@ -1707,7 +1674,7 @@ struct llama_server_context
|
||||
completion_token_output result;
|
||||
const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i);
|
||||
|
||||
llama_sampling_accept(slot.ctx_sampling, ctx, id);
|
||||
llama_sampling_accept(slot.ctx_sampling, ctx, id, true);
|
||||
|
||||
if (slot.n_decoded == 1)
|
||||
{
|
||||
|
@ -112,16 +112,16 @@ int main(int argc, char ** argv) {
|
||||
bool has_eos = false;
|
||||
|
||||
// target model sampling context
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
|
||||
|
||||
// draft sequence data
|
||||
std::vector<seq_draft> drafts(n_seq_dft);
|
||||
|
||||
params.grammar.clear(); // the draft samplers will copy the target sampler's grammar
|
||||
params.sampling_params.temp = std::max(0.01f, params.sampling_params.temp);
|
||||
params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
|
||||
params.sparams.temp = std::max(0.01f, params.sparams.temp);
|
||||
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
drafts[s].ctx_sampling = llama_sampling_init(params);
|
||||
drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
|
||||
}
|
||||
|
||||
llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
|
||||
@ -154,7 +154,7 @@ int main(int argc, char ** argv) {
|
||||
// sample from the target model
|
||||
llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx_tgt, id);
|
||||
llama_sampling_accept(ctx_sampling, ctx_tgt, id, true);
|
||||
|
||||
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
|
||||
|
||||
@ -328,7 +328,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const int s = sa[is];
|
||||
|
||||
llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id);
|
||||
llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true);
|
||||
|
||||
drafts[s].tokens.push_back(id);
|
||||
|
||||
|
332
ggml-opencl.cpp
@ -1489,46 +1489,45 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
|
||||
|
||||
size_t x_offset = 0;
|
||||
int64_t pi02 = -1;
|
||||
int64_t pi03 = -1;
|
||||
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
||||
int64_t i03 = i13 / r3;
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
// TODO: copy src0 here when r3>1
|
||||
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
if (src0->backend == GGML_BACKEND_GPU) {
|
||||
x_offset = (i03 * ne02 + i02) * x_ne;
|
||||
} else {
|
||||
// copy src0 to device
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
|
||||
}
|
||||
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
||||
int64_t i02 = i12 / r2;
|
||||
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
|
||||
// copy src1 to device
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
||||
|
||||
// copy data to device
|
||||
if (src0->backend == GGML_BACKEND_GPU) {
|
||||
x_offset = (i03 * ne02 + i02) * x_ne;
|
||||
} else if (i02 != pi02 || i03 != pi03) {
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
|
||||
pi02 = i02;
|
||||
pi03 = i03;
|
||||
CL_CHECK(clFinish(queue));
|
||||
|
||||
// compute
|
||||
cl_event ev_sgemm;
|
||||
clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
|
||||
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, x_offset, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
&queue, &ev_sgemm);
|
||||
|
||||
if (status != clblast::StatusCode::kSuccess) {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
|
||||
}
|
||||
}
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
||||
|
||||
CL_CHECK(clFinish(queue));
|
||||
|
||||
// compute
|
||||
cl_event ev_sgemm;
|
||||
clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
|
||||
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, x_offset, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
&queue, &ev_sgemm);
|
||||
|
||||
if (status != clblast::StatusCode::kSuccess) {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
|
||||
}
|
||||
}
|
||||
|
||||
@ -1589,73 +1588,70 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
|
||||
|
||||
size_t x_offset = 0;
|
||||
int64_t pi02 = -1;
|
||||
int64_t pi03 = -1;
|
||||
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
||||
int64_t i03 = i13 / r3;
|
||||
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
||||
int64_t i02 = i12 / r2;
|
||||
|
||||
// copy src0 to device
|
||||
if (src0->backend == GGML_BACKEND_GPU) {
|
||||
x_offset = (i03 * ne02 + i02) * x_ne;
|
||||
} else if (i02 != pi02 || i03 != pi03) {
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
|
||||
pi02 = i02;
|
||||
pi03 = i03;
|
||||
}
|
||||
|
||||
// convert src1 to fp16
|
||||
// TODO: use multiple threads
|
||||
char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
|
||||
if (src1_cont_rows) {
|
||||
if (src1_cont_cols) {
|
||||
ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
// TODO: copy src0 here when r3>1
|
||||
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
if (src0->backend == GGML_BACKEND_GPU) {
|
||||
x_offset = (i03 * ne02 + i02) * x_ne;
|
||||
} else {
|
||||
// copy src0 to device
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
|
||||
}
|
||||
else {
|
||||
for (int64_t i11 = 0; i11 < ne11; i11++) {
|
||||
ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
|
||||
|
||||
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
|
||||
// convert src1 to fp16
|
||||
// TODO: use multiple threads
|
||||
char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
|
||||
if (src1_cont_rows) {
|
||||
if (src1_cont_cols) {
|
||||
ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
|
||||
}
|
||||
else {
|
||||
for (int64_t i11 = 0; i11 < ne11; i11++) {
|
||||
ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int64_t i11 = 0; i11 < ne11; i11++) {
|
||||
for (int64_t i10 = 0; i10 < ne10; i10++) {
|
||||
// very slow due to no inlining
|
||||
tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
|
||||
else {
|
||||
for (int64_t i11 = 0; i11 < ne11; i11++) {
|
||||
for (int64_t i10 = 0; i10 < ne10; i10++) {
|
||||
// very slow due to no inlining
|
||||
tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// copy src1 to device
|
||||
CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
|
||||
|
||||
CL_CHECK(clFinish(queue));
|
||||
|
||||
// compute
|
||||
cl_event ev_sgemm;
|
||||
clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
|
||||
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, x_offset, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
&queue, &ev_sgemm);
|
||||
|
||||
if (status != clblast::StatusCode::kSuccess) {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
// copy dst to host, then convert to float
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
|
||||
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
|
||||
ggml_fp16_to_fp32_row(tmp, d, d_ne);
|
||||
}
|
||||
}
|
||||
|
||||
// copy src1 to device
|
||||
CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
|
||||
|
||||
CL_CHECK(clFinish(queue));
|
||||
|
||||
// compute
|
||||
cl_event ev_sgemm;
|
||||
clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
|
||||
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, x_offset, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
&queue, &ev_sgemm);
|
||||
|
||||
if (status != clblast::StatusCode::kSuccess) {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
// copy dst to host, then convert to float
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
|
||||
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
|
||||
ggml_fp16_to_fp32_row(tmp, d, d_ne);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1718,85 +1714,81 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||
size_t ev_idx = 0;
|
||||
std::vector<cl_event> events;
|
||||
|
||||
int64_t pi02 = -1;
|
||||
int64_t pi03 = -1;
|
||||
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
||||
int64_t i03 = i13 / r3;
|
||||
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
||||
int64_t i02 = i12 / r2;
|
||||
|
||||
// copy src0 to device if necessary
|
||||
if (src0->backend == GGML_BACKEND_CPU) {
|
||||
if (i02 != pi02 || i03 != pi03) {
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
// TODO: copy and dequantize src0 here when r3>1
|
||||
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
// copy src0 to device if necessary
|
||||
if (src0->backend == GGML_BACKEND_CPU) {
|
||||
events.emplace_back();
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
|
||||
pi02 = i02;
|
||||
pi03 = i03;
|
||||
}
|
||||
} else if (src0->backend == GGML_BACKEND_GPU) {
|
||||
d_Q = (cl_mem) src0->extra;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
|
||||
// copy src1 to device
|
||||
events.emplace_back();
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
|
||||
|
||||
// compute
|
||||
const size_t global = ne01 * local;
|
||||
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
||||
const cl_int ncols = ne00;
|
||||
events.emplace_back();
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
|
||||
} else { // general dequantization kernel + CLBlast matrix matrix multiplication
|
||||
// convert src0 to fp32 on device
|
||||
const size_t global = x_ne / global_denom;
|
||||
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
||||
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
|
||||
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
|
||||
|
||||
// copy src1 to device
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
||||
|
||||
events.emplace_back();
|
||||
|
||||
// wait for conversion
|
||||
CL_CHECK(clFinish(queue));
|
||||
|
||||
// compute
|
||||
clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
|
||||
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, 0, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
&queue, events.data() + ev_idx++);
|
||||
|
||||
if (status != clblast::StatusCode::kSuccess) {
|
||||
} else if (src0->backend == GGML_BACKEND_GPU) {
|
||||
d_Q = (cl_mem) src0->extra;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
|
||||
for (auto *event : events) {
|
||||
clReleaseEvent(event);
|
||||
}
|
||||
if (!mul_mat_vec) {
|
||||
// convert src0 to fp32 on device
|
||||
const size_t global = x_ne / global_denom;
|
||||
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
||||
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
|
||||
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
|
||||
}
|
||||
|
||||
ev_idx = 0;
|
||||
events.clear();
|
||||
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
|
||||
if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
|
||||
// copy src1 to device
|
||||
events.emplace_back();
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
|
||||
|
||||
// compute
|
||||
const size_t global = ne01 * local;
|
||||
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
||||
const cl_int ncols = ne00;
|
||||
events.emplace_back();
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
|
||||
} else { // CLBlast matrix matrix multiplication
|
||||
// copy src1 to device
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
||||
|
||||
// wait for conversion
|
||||
CL_CHECK(clFinish(queue));
|
||||
|
||||
// compute
|
||||
events.emplace_back();
|
||||
clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
|
||||
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, 0, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
&queue, events.data() + ev_idx++);
|
||||
|
||||
if (status != clblast::StatusCode::kSuccess) {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
|
||||
for (auto *event : events) {
|
||||
clReleaseEvent(event);
|
||||
}
|
||||
|
||||
ev_idx = 0;
|
||||
events.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
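The restructured loops in ggml-opencl.cpp swap the broadcast traversal: instead of deriving i03 = i13 / r3 and i02 = i12 / r2 inside the i13/i12 loops, the outer loops now walk i03/i02 and the inner loops walk the i13/i12 values that belong to them, so src0 only needs to be uploaded (or dequantized) once per (i03, i02) pair. A small sketch (Python, for illustration only) checking that the two orderings enumerate the same index tuples when ne13 = ne03*r3 and ne12 = ne02*r2:

# Sketch only: the old and new loop orders visit the same (i03, i13, i02, i12) tuples.
def old_order(ne03, ne02, r3, r2):
    for i13 in range(ne03 * r3):
        i03 = i13 // r3
        for i12 in range(ne02 * r2):
            i02 = i12 // r2
            yield (i03, i13, i02, i12)

def new_order(ne03, ne02, r3, r2):
    for i03 in range(ne03):
        for i13 in range(i03 * r3, i03 * r3 + r3):
            for i02 in range(ne02):
                for i12 in range(i02 * r2, i02 * r2 + r2):
                    yield (i03, i13, i02, i12)

assert sorted(old_order(2, 3, 4, 2)) == sorted(new_order(2, 3, 4, 2))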
22
ggml.c
@ -13537,7 +13537,7 @@ static void ggml_compute_forward_rope_f16(
|
||||
dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
|
||||
dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
|
||||
}
|
||||
} if (!is_neox) {
|
||||
} else if (!is_neox) {
|
||||
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
||||
const float cos_theta = cosf(theta);
|
||||
const float sin_theta = sinf(theta);
|
||||
@ -19170,6 +19170,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
|
||||
|
||||
if (idx == -1) {
|
||||
fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
|
||||
fclose(fout);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -20844,7 +20845,7 @@ struct gguf_kv {
|
||||
};
|
||||
|
||||
struct gguf_header {
|
||||
uint32_t magic;
|
||||
char magic[4];
|
||||
uint32_t version;
|
||||
uint64_t n_tensors; // GGUFv2
|
||||
uint64_t n_kv; // GGUFv2
|
||||
@ -20914,7 +20915,7 @@ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset)
|
||||
struct gguf_context * gguf_init_empty(void) {
|
||||
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
|
||||
|
||||
ctx->header.magic = GGUF_MAGIC;
|
||||
memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
|
||||
ctx->header.version = GGUF_VERSION;
|
||||
ctx->header.n_tensors = 0;
|
||||
ctx->header.n_kv = 0;
|
||||
@ -20940,16 +20941,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
// offset from start of file
|
||||
size_t offset = 0;
|
||||
|
||||
uint32_t magic = 0;
|
||||
char magic[4];
|
||||
|
||||
// check the magic before making allocations
|
||||
{
|
||||
gguf_fread_el(file, &magic, sizeof(magic), &offset);
|
||||
|
||||
if (magic != GGUF_MAGIC) {
|
||||
fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
|
||||
fclose(file);
|
||||
return NULL;
|
||||
for (uint32_t i = 0; i < sizeof(magic); i++) {
|
||||
if (magic[i] != GGUF_MAGIC[i]) {
|
||||
fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
|
||||
fclose(file);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -20959,7 +20962,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
||||
|
||||
// read the header
|
||||
{
|
||||
ctx->header.magic = magic;
|
||||
strncpy(ctx->header.magic, magic, 4);
|
||||
|
||||
|
||||
ctx->kv = NULL;
|
||||
ctx->infos = NULL;
|
||||
|
5
ggml.h
@ -231,8 +231,9 @@
|
||||
#define GGML_EXIT_SUCCESS 0
|
||||
#define GGML_EXIT_ABORTED 1
|
||||
|
||||
#define GGUF_MAGIC 0x46554747 // "GGUF"
|
||||
#define GGUF_VERSION 2
|
||||
#define GGUF_MAGIC "GGUF"
|
||||
|
||||
#define GGUF_VERSION 3
|
||||
|
||||
#define GGUF_DEFAULT_ALIGNMENT 32
|
||||
|
||||
|
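With GGUF_MAGIC now defined as the four characters "GGUF" (and GGUF_VERSION bumped to 3), the loader compares the magic byte by byte instead of as a packed uint32. A reader sketch (Python, illustration only; it assumes a little-endian file and follows the header layout from gguf_header above: char magic[4], uint32 version, uint64 n_tensors, uint64 n_kv):

# Sketch only: read and sanity-check a GGUF header.
# Big-endian files would need ">" formats instead of "<".
import struct

def read_gguf_header(path):
    with open(path, "rb") as f:
        magic = f.read(4)
        if magic != b"GGUF":
            raise ValueError(f"invalid magic {magic!r}")
        version,   = struct.unpack("<I", f.read(4))
        n_tensors, = struct.unpack("<Q", f.read(8))
        n_kv,      = struct.unpack("<Q", f.read(8))
    return version, n_tensors, n_kv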
@ -19,9 +19,10 @@ import numpy as np
|
||||
#
|
||||
|
||||
GGUF_MAGIC = 0x46554747
|
||||
GGUF_VERSION = 2
|
||||
GGUF_VERSION = 3
|
||||
GGUF_DEFAULT_ALIGNMENT = 32
|
||||
|
||||
|
||||
# general
|
||||
KEY_GENERAL_ARCHITECTURE = "general.architecture"
|
||||
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
|
||||
@ -597,6 +598,10 @@ class GGMLQuantizationType(IntEnum):
|
||||
Q6_K = 14
|
||||
Q8_K = 15
|
||||
|
||||
class GGUFEndian(IntEnum):
|
||||
LITTLE = 0
|
||||
BIG = 1
|
||||
|
||||
|
||||
class GGUFValueType(IntEnum):
|
||||
UINT8 = 0
|
||||
@ -644,18 +649,41 @@ class GGUFWriter:
|
||||
temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None
|
||||
tensors: list[tuple[np.ndarray[Any, Any], int]]
|
||||
|
||||
def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True):
|
||||
@property
|
||||
def pack_prefix(self):
|
||||
if self.endianess==GGUFEndian.LITTLE:
|
||||
return "<"
|
||||
else:
|
||||
return ">"
|
||||
|
||||
def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True, endianess=GGUFEndian.LITTLE):
|
||||
self.fout = open(path, "wb")
|
||||
self.arch = arch
|
||||
self.endianess = endianess
|
||||
self._simple_value_packing = {
|
||||
GGUFValueType.UINT8: f"{self.pack_prefix}B",
|
||||
GGUFValueType.INT8: f"{self.pack_prefix}b",
|
||||
GGUFValueType.UINT16: f"{self.pack_prefix}H",
|
||||
GGUFValueType.INT16: f"{self.pack_prefix}h",
|
||||
GGUFValueType.UINT32: f"{self.pack_prefix}I",
|
||||
GGUFValueType.INT32: f"{self.pack_prefix}i",
|
||||
GGUFValueType.FLOAT32: f"{self.pack_prefix}f",
|
||||
GGUFValueType.UINT64: f"{self.pack_prefix}Q",
|
||||
GGUFValueType.INT64: f"{self.pack_prefix}q",
|
||||
GGUFValueType.FLOAT64: f"{self.pack_prefix}d",
|
||||
GGUFValueType.BOOL: "?" ,
|
||||
}
|
||||
self.add_architecture()
|
||||
self.use_temp_file = use_temp_file
|
||||
self.tensors = []
|
||||
endianess_str = "Big Endian" if self.endianess == GGUFEndian.BIG else "Little Endian"
|
||||
print(f"This gguf file is for {endianess_str} only")
|
||||
|
||||
def write_header_to_file(self):
|
||||
self.fout.write(struct.pack("<I", GGUF_MAGIC))
|
||||
self.fout.write(struct.pack("<I", GGUF_VERSION))
|
||||
self.fout.write(struct.pack("<Q", self.ti_data_count))
|
||||
self.fout.write(struct.pack("<Q", self.kv_data_count))
|
||||
self.fout.write(struct.pack(f"{self.pack_prefix}I", GGUF_VERSION))
|
||||
self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.ti_data_count))
|
||||
self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.kv_data_count))
|
||||
self.flush()
|
||||
# print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
|
||||
|
||||
@ -727,25 +755,12 @@ class GGUFWriter:
|
||||
self.add_key(key)
|
||||
self.add_val(val, GGUFValueType.ARRAY)
|
||||
|
||||
_simple_value_packing = {
|
||||
GGUFValueType.UINT8: "<B",
|
||||
GGUFValueType.INT8: "<b",
|
||||
GGUFValueType.UINT16: "<H",
|
||||
GGUFValueType.INT16: "<h",
|
||||
GGUFValueType.UINT32: "<I",
|
||||
GGUFValueType.INT32: "<i",
|
||||
GGUFValueType.FLOAT32: "<f",
|
||||
GGUFValueType.UINT64: "<Q",
|
||||
GGUFValueType.INT64: "<q",
|
||||
GGUFValueType.FLOAT64: "<d",
|
||||
GGUFValueType.BOOL: "?" ,
|
||||
}
|
||||
def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True):
|
||||
if vtype is None:
|
||||
vtype = GGUFValueType.get_type(val)
|
||||
|
||||
if add_vtype:
|
||||
self.kv_data += struct.pack("<I", vtype)
|
||||
self.kv_data += struct.pack(f"{self.pack_prefix}I", vtype)
|
||||
self.kv_data_count += 1
|
||||
|
||||
pack_fmt = self._simple_value_packing.get(vtype)
|
||||
@ -753,14 +768,14 @@ class GGUFWriter:
|
||||
self.kv_data += struct.pack(pack_fmt, val)
|
||||
elif vtype == GGUFValueType.STRING:
|
||||
encoded_val = val.encode("utf8") if isinstance(val, str) else val
|
||||
self.kv_data += struct.pack("<Q", len(encoded_val))
|
||||
self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_val))
|
||||
self.kv_data += encoded_val
|
||||
elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
|
||||
ltype = GGUFValueType.get_type(val[0])
|
||||
if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
|
||||
raise ValueError("All items in a GGUF array should be of the same type")
|
||||
self.kv_data += struct.pack("<I", ltype)
|
||||
self.kv_data += struct.pack("<Q", len(val))
|
||||
self.kv_data += struct.pack(f"{self.pack_prefix}I", ltype)
|
||||
self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(val))
|
||||
for item in val:
|
||||
self.add_val(item, add_vtype=False)
|
||||
else:
|
||||
@ -774,22 +789,24 @@ class GGUFWriter:
assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"

encoded_name = name.encode("utf8")
self.ti_data += struct.pack("<Q", len(encoded_name))
self.ti_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_name))
self.ti_data += encoded_name
n_dims = len(tensor_shape)
self.ti_data += struct.pack("<I", n_dims)
self.ti_data += struct.pack(f"{self.pack_prefix}I", n_dims)
for i in range(n_dims):
self.ti_data += struct.pack("<Q", tensor_shape[n_dims - 1 - i])
self.ti_data += struct.pack(f"{self.pack_prefix}Q", tensor_shape[n_dims - 1 - i])
if raw_dtype is None:
dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
else:
dtype = raw_dtype
self.ti_data += struct.pack("<I", dtype)
self.ti_data += struct.pack("<Q", self.offset_tensor)
self.ti_data += struct.pack(f"{self.pack_prefix}I", dtype)
self.ti_data += struct.pack(f"{self.pack_prefix}Q", self.offset_tensor)
self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
self.ti_data_count += 1
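The tensor-info record written by add_tensor_info is: name length and name bytes, n_dims, the dimensions in reverse order, the dtype id, and the running tensor offset padded to data_alignment. A small sketch of just the shape part, to make the reversal explicit (the helper name is illustrative):

```python
import struct

def pack_tensor_shape(prefix: str, shape: tuple) -> bytes:
    # n_dims first, then each dimension in reverse order, as in add_tensor_info above
    out = struct.pack(f"{prefix}I", len(shape))
    for dim in reversed(shape):
        out += struct.pack(f"{prefix}Q", dim)
    return out

# pack_tensor_shape("<", (32, 4096)) packs n_dims=2, then 4096, then 32
```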
def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None):
if self.endianess == GGUFEndian.BIG:
tensor.byteswap(inplace=True)
if self.use_temp_file and self.temp_file is None:
fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
fp.seek(0)
@ -815,6 +832,8 @@ class GGUFWriter:
fp.write(bytes([0] * pad))

def write_tensor_data(self, tensor: np.ndarray[Any, Any]):
if self.endianess==GGUFEndian.BIG:
tensor.byteswap(inplace=True)
self.write_padding(self.fout, self.fout.tell())
tensor.tofile(self.fout)
self.write_padding(self.fout, tensor.nbytes)
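Both add_tensor and write_tensor_data byteswap the numpy tensor in place when the target file is big-endian, so the bytes emitted by tofile() land in the file's byte order. A quick illustration of what that swap does to the raw bytes (assumes a little-endian host):

```python
import numpy as np

t = np.array([1.0], dtype=np.float32)
print(t.tobytes())        # b'\x00\x00\x80?'  -- float32 1.0 in little-endian layout
t.byteswap(inplace=True)  # swap the bytes of each element, as in the diff above
print(t.tobytes())        # b'?\x80\x00\x00'  -- the same 1.0, now laid out big-endian
```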
@ -1,6 +1,6 @@
[tool.poetry]
name = "gguf"
version = "0.4.4"
version = "0.4.5"
description = "Write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [
@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
#if !defined(__riscv)
#if !defined(__riscv) && !defined(__s390__)
#include <immintrin.h>
#endif
#endif
101
llama.cpp
@ -1018,8 +1018,8 @@ enum e_model {
};

static const size_t kB = 1024;
static const size_t MB = kB*kB;
static const size_t GB = kB*kB*kB;
static const size_t MB = 1024*kB;
static const size_t GB = 1024*MB;

struct llama_hparams {
bool vocab_only;
@ -1042,21 +1042,21 @@ struct llama_hparams {
float f_max_alibi_bias;

bool operator!=(const llama_hparams & other) const {
if (this->vocab_only != other.vocab_only) return true;
if (this->n_vocab != other.n_vocab) return true;
if (this->vocab_only != other.vocab_only) return true;
if (this->n_vocab != other.n_vocab) return true;
if (this->n_ctx_train != other.n_ctx_train) return true;
if (this->n_embd != other.n_embd) return true;
if (this->n_head != other.n_head) return true;
if (this->n_head_kv != other.n_head_kv) return true;
if (this->n_layer != other.n_layer) return true;
if (this->n_rot != other.n_rot) return true;
if (this->n_ff != other.n_ff) return true;
if (this->n_embd != other.n_embd) return true;
if (this->n_head != other.n_head) return true;
if (this->n_head_kv != other.n_head_kv) return true;
if (this->n_layer != other.n_layer) return true;
if (this->n_rot != other.n_rot) return true;
if (this->n_ff != other.n_ff) return true;

const float EPSILON = 1e-9;

if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;

return false;
@ -1195,11 +1195,11 @@ struct llama_vocab {
id special_sep_id = -1;
id special_pad_id = -1;

id linefeed_id = 13;
id linefeed_id = 13;
id special_prefix_id = 32007;
id special_middle_id = 32009;
id special_suffix_id = 32008;
id special_eot_id = 32010;
id special_eot_id = 32010;

int find_bpe_rank(std::string token_left, std::string token_right) const {
replace_all(token_left, " ", "\u0120");
@ -1359,10 +1359,7 @@ static bool llama_kv_cache_init(
cache.cells.clear();
cache.cells.resize(n_ctx);

// TODO: this should be:
// cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
// change it and test that it works
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
memset(cache.buf.data, 0, cache.buf.size);

struct ggml_init_params params;
@ -6324,7 +6321,6 @@ struct llm_tokenizer_bpe {
llm_symbol sym;
size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
sym.text = word.c_str() + offset;
sym.n = 1;
sym.n = char_len;
offset += sym.n;
sym.prev = index - 1;
@ -7054,7 +7050,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
std::vector<llama_grammar_candidate> rejects;

if (stack.empty()) {
for (auto tok : candidates) {
for (const auto & tok : candidates) {
if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
rejects.push_back(tok);
}
@ -7065,7 +7061,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
const llama_grammar_element * stack_pos = stack.back();

std::vector<llama_grammar_candidate> next_candidates;
for (auto tok : candidates) {
for (const auto & tok : candidates) {
if (*tok.code_points == 0) {
// reached end of full codepoints in token, reject iff it ended in a partial sequence
// that cannot satisfy this position in grammar
@ -7091,7 +7087,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
llama_grammar_advance_stack(rules, stack_after, next_stacks);

auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
for (auto tok : next_rejects) {
for (const auto & tok : next_rejects) {
rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
}
@ -7418,37 +7414,15 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
llama_sample_temp(ctx, candidates_p, temp);
}

void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
if (last_tokens_size == 0 || penalty == 1.0f) {
return;
}

const int64_t t_start_sample_us = ggml_time_us();

for (size_t i = 0; i < candidates->size; ++i) {
const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
if (token_iter == last_tokens + last_tokens_size) {
continue;
}

// The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
// This is common fix for this problem, which is to multiply by the penalty instead of dividing.
if (candidates->data[i].logit <= 0) {
candidates->data[i].logit *= penalty;
} else {
candidates->data[i].logit /= penalty;
}
}

candidates->sorted = false;

if (ctx) {
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
}
}

void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
void llama_sample_repetition_penalties(
struct llama_context * ctx,
llama_token_data_array * candidates,
const llama_token * last_tokens,
size_t penalty_last_n,
float penalty_repeat,
float penalty_freq,
float penalty_present) {
if (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)) {
return;
}

@ -7456,19 +7430,28 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l

// Create a frequency map to count occurrences of each token in last_tokens
std::unordered_map<llama_token, int> token_count;
for (size_t i = 0; i < last_tokens_size; ++i) {
token_count[last_tokens_p[i]]++;
for (size_t i = 0; i < penalty_last_n; ++i) {
token_count[last_tokens[i]]++;
}

// Apply frequency and presence penalties to the candidates
for (size_t i = 0; i < candidates->size; ++i) {
auto token_iter = token_count.find(candidates->data[i].id);
const auto token_iter = token_count.find(candidates->data[i].id);
if (token_iter == token_count.end()) {
continue;
}

int count = token_iter->second;
candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence;
const int count = token_iter->second;

// The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
// This is common fix for this problem, which is to multiply by the penalty instead of dividing.
if (candidates->data[i].logit <= 0) {
candidates->data[i].logit *= penalty_repeat;
} else {
candidates->data[i].logit /= penalty_repeat;
}

candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present;
}

candidates->sorted = false;
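The two old samplers are merged here into a single llama_sample_repetition_penalties pass: occurrences of each candidate token within the last penalty_last_n tokens are counted once, the repetition penalty divides positive logits and multiplies negative ones (so the token is always made less likely), and the frequency/presence terms are subtracted afterwards. A standalone Python rendering of that arithmetic, assuming token ids index directly into the logit list (an illustration, not the C implementation):

```python
from collections import Counter

def apply_penalties(logits, last_tokens, penalty_repeat, penalty_freq, penalty_present):
    counts = Counter(last_tokens)            # one count per token in the penalty window
    out = list(logits)
    for tok, count in counts.items():
        # repetition penalty: always push the token down, whatever the sign of its logit
        if out[tok] <= 0:
            out[tok] *= penalty_repeat
        else:
            out[tok] /= penalty_repeat
        # OpenAI-style frequency and presence penalties
        out[tok] -= count * penalty_freq + (count > 0) * penalty_present
    return out

# apply_penalties([1.0, -1.0, 2.0], [0, 0, 1], 1.1, 0.2, 0.5)
# -> token 0: 1.0/1.1 - (2*0.2 + 0.5), token 1: -1.0*1.1 - (1*0.2 + 0.5), token 2 unchanged
```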
16
llama.h
@ -560,21 +560,15 @@ extern "C" {
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);

/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
LLAMA_API void llama_sample_repetition_penalty(
struct llama_context * ctx,
llama_token_data_array * candidates,
const llama_token * last_tokens,
size_t last_tokens_size,
float penalty);

/// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
LLAMA_API void llama_sample_frequency_and_presence_penalties(
LLAMA_API void llama_sample_repetition_penalties(
struct llama_context * ctx,
llama_token_data_array * candidates,
const llama_token * last_tokens,
size_t last_tokens_size,
float alpha_frequency,
float alpha_presence);
size_t penalty_last_n,
float penalty_repeat,
float penalty_freq,
float penalty_present);

/// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
@ -4,7 +4,9 @@

#undef NDEBUG
#include <cassert>
#if !defined(__riscv) && !defined(__s390__)
#include <immintrin.h>
#endif
#include <cmath>
#include <cstdint>
#include <cstring>
@ -8,11 +8,9 @@
#include <cmath>
#include <numeric>
#include <cassert>
#include <iostream>
#include <vector>
#include <algorithm>

static void dump(const llama_token_data_array * candidates) {
for (size_t i = 0; i < candidates->size; i++) {
printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit);
@ -21,7 +19,6 @@ static void dump(const llama_token_data_array * candidates) {

#define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0)

static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates;
@ -37,13 +34,12 @@ static void test_top_k(const std::vector<float> & probs, const std::vector<float
llama_sample_top_k(nullptr, &candidates_p, k, 1);
DUMP(&candidates_p);

assert(candidates_p.size == expected_probs.size());
GGML_ASSERT(candidates_p.size == expected_probs.size());
for (size_t i = 0; i < candidates_p.size; i++) {
assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-5);
GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-5);
}
}

static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates;
@ -59,13 +55,12 @@ static void test_top_p(const std::vector<float> & probs, const std::vector<float
llama_sample_top_p(nullptr, &candidates_p, p, 1);
DUMP(&candidates_p);

assert(candidates_p.size == expected_probs.size());
GGML_ASSERT(candidates_p.size == expected_probs.size());
for (size_t i = 0; i < candidates_p.size; i++) {
assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
}
}

static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates;
@ -80,13 +75,12 @@ static void test_tfs(const std::vector<float> & probs, const std::vector<float>
llama_sample_tail_free(nullptr, &candidates_p, z, 1);
DUMP(&candidates_p);

assert(candidates_p.size == expected_probs.size());
GGML_ASSERT(candidates_p.size == expected_probs.size());
for (size_t i = 0; i < candidates_p.size; i++) {
assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
}
}

static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates;
@ -101,18 +95,17 @@ static void test_typical(const std::vector<float> & probs, const std::vector<flo
llama_sample_typical(nullptr, &candidates_p, p, 1);
DUMP(&candidates_p);

assert(candidates_p.size == expected_probs.size());
GGML_ASSERT(candidates_p.size == expected_probs.size());
for (size_t i = 0; i < candidates_p.size; i++) {
assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
}
}
static void test_repetition_penalty(
static void test_repetition_penalties(
const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
const std::vector<float> & expected_probs, float penalty
const std::vector<float> & expected_probs, float repeat_penalty, float alpha_frequency, float alpha_presence
) {
assert(probs.size() == expected_probs.size());
GGML_ASSERT(probs.size() == expected_probs.size());

size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates;
@ -125,41 +118,13 @@ static void test_repetition_penalty(
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
llama_sample_softmax(nullptr, &candidates_p);
DUMP(&candidates_p);
llama_sample_repetition_penalty(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), penalty);
llama_sample_repetition_penalties(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence);
llama_sample_softmax(nullptr, &candidates_p);
DUMP(&candidates_p);

assert(candidates_p.size == expected_probs.size());
GGML_ASSERT(candidates_p.size == expected_probs.size());
for (size_t i = 0; i < candidates_p.size; i++) {
assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-6);
}
}

static void test_frequency_presence_penalty(
const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
const std::vector<float> & expected_probs, float alpha_frequency, float alpha_presence
) {
assert(probs.size() == expected_probs.size());

size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
float logit = log(probs[token_id]);
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
}

llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
llama_sample_softmax(nullptr, &candidates_p);
// DUMP(&candidates_p);
llama_sample_frequency_and_presence_penalties(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), alpha_frequency, alpha_presence);
llama_sample_softmax(nullptr, &candidates_p);
// DUMP(&candidates_p);

assert(candidates_p.size == expected_probs.size());
for (size_t i = 0; i < candidates_p.size; i++) {
assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
}
}

@ -181,13 +146,13 @@ int main(void) {
test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f);
test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f);

test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0}, 50.0f);
test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0}, 50.0f);
test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f);
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0}, 50.0f, 0.0f, 0.0f);
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f);
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f);

test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 5.0f, 5.0f);
test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 5.0f, 5.0f);
test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 5.0f, 5.0f);
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 1.0f, 5.0f, 5.0f);
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f);
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f);

printf("OK\n");
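The rewritten tests map the old single-purpose calls onto the merged API by passing neutral values for the unused penalties (penalty_repeat = 1.0f, or zero frequency/presence), and the expected probabilities follow from the arithmetic shown earlier. The first test_repetition_penalties case can be checked by hand; a small sketch with the same inputs as the test (the softmax step in the sampler also sorts candidates by logit, which is why the penalized token appears last in the expected vector):

```python
import math

probs = [0.2, 0.2, 0.2, 0.2, 0.2]
logits = [math.log(p) for p in probs]

# token 0 is in last_tokens; log(0.2) is negative, so the penalty of 50 multiplies it
logits[0] *= 50.0

exps = [math.exp(l) for l in logits]
total = sum(exps)
print([e / total for e in exps])
# ~[0.0, 0.25, 0.25, 0.25, 0.25] -- token 0 is effectively eliminated, matching
# the expected {0.25f, 0.25f, 0.25f, 0.25f, 0} once candidates are re-sorted
```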