diff --git a/common/common.cpp b/common/common.cpp
index aea123cf4..a30d6920e 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -910,12 +910,13 @@ struct common_init_result common_init_from_params(common_params & params) {
             return iparams;
         }

-        int err = llama_control_vector_apply(lctx,
-                                             cvec.data.data(),
-                                             cvec.data.size(),
-                                             cvec.n_embd,
-                                             params.control_vector_layer_start,
-                                             params.control_vector_layer_end);
+        int err = llama_apply_adapter_cvec(
+                lctx,
+                cvec.data.data(),
+                cvec.data.size(),
+                cvec.n_embd,
+                params.control_vector_layer_start,
+                params.control_vector_layer_end);
         if (err) {
             llama_free(lctx);
             llama_model_free(model);
@@ -926,8 +927,8 @@ struct common_init_result common_init_from_params(common_params & params) {

     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-        llama_lora_adapter_ptr lora;
-        lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+        llama_adapter_lora_ptr lora;
+        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
         if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
@@ -940,7 +941,7 @@ struct common_init_result common_init_from_params(common_params & params) {
     }

     if (!params.lora_init_without_apply) {
-        common_lora_adapters_apply(lctx, params.lora_adapters);
+        common_set_adapter_lora(lctx, params.lora_adapters);
     }

     if (params.sampling.ignore_eos && llama_token_eos(vocab) == LLAMA_TOKEN_NULL) {
@@ -1008,11 +1009,11 @@ struct common_init_result common_init_from_params(common_params & params) {
     return iparams;
 }

-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
-    llama_lora_adapter_clear(ctx);
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
+    llama_clear_adapter_lora(ctx);
     for (auto & la : lora) {
         if (la.scale != 0.0f) {
-            llama_lora_adapter_set(ctx, la.ptr, la.scale);
+            llama_set_adapter_lora(ctx, la.ptr, la.scale);
         }
     }
 }
diff --git a/common/common.h b/common/common.h
index 74efcbab9..d523948b0 100644
--- a/common/common.h
+++ b/common/common.h
@@ -24,11 +24,11 @@

 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

-struct common_lora_adapter_info {
+struct common_adapter_lora_info {
     std::string path;
     float scale;

-    struct llama_lora_adapter * ptr;
+    struct llama_adapter_lora * ptr;
 };

 using llama_tokens = std::vector<llama_token>;
@@ -246,8 +246,8 @@ struct common_params {
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;

-    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
-    std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
+    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale

     std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale

@@ -481,7 +481,7 @@ struct common_init_result {
     llama_model_ptr   model;
     llama_context_ptr context;

-    std::vector<llama_lora_adapter_ptr> lora;
+    std::vector<llama_adapter_lora_ptr> lora;
 };

 struct common_init_result common_init_from_params(common_params & params);
@@ -503,7 +503,7 @@ struct llama_model * common_load_model_from_hf(
     const struct llama_model_params & params);

 // clear LoRA adapters from context, then apply new list of adapters
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);

 //
 // Batch utils
diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp
index d5dcd20a0..124184f77 100644
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -130,7 +130,7 @@ struct lora_merge_ctx {

     lora_merge_ctx(
             std::string & base_fname,
-            std::vector<common_lora_adapter_info> & lora_files,
+            std::vector<common_adapter_lora_info> & lora_files,
             std::string & outfile,
             int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
         fout.exceptions(std::ofstream::failbit); // fail fast on write errors
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 4d2810206..7096a883e 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -98,7 +98,7 @@ struct slot_params {
     int64_t t_max_prompt_ms  = -1; // TODO: implement
     int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit

-    std::vector<common_lora_adapter_info> lora;
+    std::vector<common_adapter_lora_info> lora;

     std::vector<std::string> antiprompt;
     std::vector<std::string> response_fields;
@@ -198,7 +198,7 @@ struct server_task {
     bool metrics_reset_bucket = false;

     // used by SERVER_TASK_TYPE_SET_LORA
-    std::vector<common_lora_adapter_info> set_lora;
+    std::vector<common_adapter_lora_info> set_lora;

     server_task(server_task_type type) : type(type) {}

@@ -1133,7 +1133,7 @@ struct server_slot {

     common_speculative * spec = nullptr;

-    std::vector<common_lora_adapter_info> lora;
+    std::vector<common_adapter_lora_info> lora;

     // the index relative to completion multi-task request
     size_t index = 0;
@@ -2934,7 +2934,7 @@ struct server_context {
                 // make sure we're in the right embedding mode
                 llama_set_embeddings(ctx, slot_batched->is_non_causal());
                 // apply lora, only need to do it once per batch
-                common_lora_adapters_apply(ctx, slot_batched->lora);
+                common_set_adapter_lora(ctx, slot_batched->lora);
             }

             // process the created batch of tokens
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index a73fa5a09..e373c44be 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -804,8 +804,8 @@ static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx
 }

 static bool are_lora_equal(
-    const std::vector<common_lora_adapter_info> & l1,
-    const std::vector<common_lora_adapter_info> & l2) {
+    const std::vector<common_adapter_lora_info> & l1,
+    const std::vector<common_adapter_lora_info> & l2) {
     if (l1.size() != l2.size()) {
         return false;
     }
@@ -819,10 +819,10 @@ static bool are_lora_equal(
 }

 // parse lora config from JSON request, returned a copy of lora_base with updated scale
-static std::vector<common_lora_adapter_info> parse_lora_request(
-    const std::vector<common_lora_adapter_info> & lora_base,
+static std::vector<common_adapter_lora_info> parse_lora_request(
+    const std::vector<common_adapter_lora_info> & lora_base,
     const json & data) {
-    std::vector<common_lora_adapter_info> lora(lora_base);
+    std::vector<common_adapter_lora_info> lora(lora_base);
     int max_idx = lora.size();

     // clear existing value
diff --git a/include/llama-cpp.h b/include/llama-cpp.h
index 11306b17f..8f6368177 100644
--- a/include/llama-cpp.h
+++ b/include/llama-cpp.h
@@ -20,11 +20,11 @@ struct llama_sampler_deleter {
     void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
 };

-struct llama_lora_adapter_deleter {
-    void operator()(llama_lora_adapter * lora_adapter) { llama_lora_adapter_free(lora_adapter); }
+struct llama_adapter_lora_deleter {
+    void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
 };

 typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
 typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
 typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
-typedef std::unique_ptr<llama_lora_adapter, llama_lora_adapter_deleter> llama_lora_adapter_ptr;
+typedef std::unique_ptr<llama_adapter_lora, llama_adapter_lora_deleter> llama_adapter_lora_ptr;
diff --git a/include/llama.h b/include/llama.h
index eb6d1031b..302df87e9 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -385,8 +385,7 @@ extern "C" {
     } llama_chat_message;

     // lora adapter
-    // TODO: rename to llama_adapter_lora
-    struct llama_lora_adapter;
+    struct llama_adapter_lora;

     // Helpers for getting default parameters
     // TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)
@@ -520,34 +519,31 @@ extern "C" {
     //

     // Load a LoRA adapter from file
-    // TODO: rename to llama_adapter_lora_init
-    LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
+    LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
             struct llama_model * model,
             const char * path_lora);

+    // Manually free a LoRA adapter
+    // Note: loaded adapters will be free when the associated model is deleted
+    LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
+
+    // The following functions operate on a llama_context, hence the naming: llama_verb_...
+
     // Add a loaded LoRA adapter to given context
     // This will not modify model's weight
-    // TODO: rename to llama_set_adapter_lora
-    LLAMA_API int32_t llama_lora_adapter_set(
+    LLAMA_API int32_t llama_set_adapter_lora(
             struct llama_context * ctx,
-            struct llama_lora_adapter * adapter,
+            struct llama_adapter_lora * adapter,
             float scale);

     // Remove a specific LoRA adapter from given context
     // Return -1 if the adapter is not present in the context
-    // TODO: rename to llama_rm_adapter_lora
-    LLAMA_API int32_t llama_lora_adapter_remove(
+    LLAMA_API int32_t llama_rm_adapter_lora(
             struct llama_context * ctx,
-            struct llama_lora_adapter * adapter);
+            struct llama_adapter_lora * adapter);

     // Remove all LoRA adapters from given context
-    // TODO: rename to llama_clear_adapter_lora
-    LLAMA_API void llama_lora_adapter_clear(struct llama_context * ctx);
-
-    // Manually free a LoRA adapter
-    // Note: loaded adapters will be free when the associated model is deleted
-    // TODO: rename to llama_adapter_lora_free
-    LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
+    LLAMA_API void llama_clear_adapter_lora(struct llama_context * ctx);

     // Apply a loaded control vector to a llama_context, or if data is NULL, clear
     // the currently loaded vector.
@@ -555,9 +551,8 @@ extern "C" {
     // to an n_embd x n_layers buffer starting from layer 1.
     // il_start and il_end are the layer range the vector should apply to (both inclusive)
     // See llama_control_vector_load in common to load a control vector.
-    // TODO: rename to llama_adapter_cvec_apply
-    LLAMA_API int32_t llama_control_vector_apply(
-            struct llama_context * lctx,
+    LLAMA_API int32_t llama_apply_adapter_cvec(
+            struct llama_context * ctx,
             const float * data,
             size_t len,
             int32_t n_embd,
diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp
index 3925b8970..8a0800463 100644
--- a/src/llama-adapter.cpp
+++ b/src/llama-adapter.cpp
@@ -1,5 +1,6 @@
 #include "llama-adapter.h"

+#include "llama-impl.h"
 #include "llama-mmap.h"
 #include "llama-model.h"

@@ -10,7 +11,7 @@

 // vec

-struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
+struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
     if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
         return nullptr;
     }
@@ -18,7 +19,7 @@ struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
     return tensors[il];
 }

-struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
+struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
     ggml_tensor * layer_dir = tensor_for(il);
     if (layer_dir != nullptr) {
         cur = ggml_add(ctx, cur, layer_dir);
@@ -27,12 +28,12 @@ struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, s
     return cur;
 }

-static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
+bool llama_adapter_cvec::init(const llama_model & model) {
     const auto & hparams = model.hparams;

-    GGML_ASSERT(cvec.tensors.empty());
-    GGML_ASSERT(cvec.ctxs.empty());
-    GGML_ASSERT(cvec.bufs.empty());
+    GGML_ASSERT(tensors.empty());
+    GGML_ASSERT(ctxs.empty());
+    GGML_ASSERT(bufs.empty());

     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -51,7 +52,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
             }

             ctx_map[buft] = ctx;
-            cvec.ctxs.emplace_back(ctx);
+            ctxs.emplace_back(ctx);

             return ctx;
         }
@@ -60,8 +61,8 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     };

     // make tensors
-    cvec.tensors.reserve(hparams.n_layer);
-    cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
+    tensors.reserve(hparams.n_layer);
+    tensors.push_back(nullptr); // there's never a tensor for layer 0
     for (size_t il = 1; il < hparams.n_layer; il++) {
         ggml_backend_buffer_type_t buft = model.select_buft(il);
         ggml_context * ctx = ctx_for_buft(buft);
@@ -70,11 +71,11 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
             return false;
         }
         ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
-        cvec.tensors.push_back(tensor);
+        tensors.push_back(tensor);
     }

     // allocate tensors / buffers and zero
-    cvec.bufs.reserve(ctx_map.size());
+    bufs.reserve(ctx_map.size());
     for (auto it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx  = it.second;
@@ -84,14 +85,13 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
             return false;
         }
         ggml_backend_buffer_clear(buf, 0);
-        cvec.bufs.emplace_back(buf);
+        bufs.emplace_back(buf);
     }

     return true;
 }

-int32_t llama_control_vector_apply(
-        struct llama_control_vector & cvec,
+int32_t llama_adapter_cvec::apply(
         const llama_model & model,
         const float * data,
         size_t len,
@@ -102,8 +102,8 @@ int32_t llama_control_vector_apply(

     if (data == nullptr) {
         // disable the current control vector (but leave allocated for later)
-        cvec.layer_start = -1;
-        cvec.layer_end   = -1;
+        layer_start = -1;
+        layer_end   = -1;
         return 0;
     }

@@ -112,21 +112,21 @@ int32_t llama_control_vector_apply(
         return 1;
     }

-    if (cvec.tensors.empty()) {
-        if (!llama_control_vector_init(cvec, model)) {
+    if (tensors.empty()) {
+        if (!init(model)) {
             return 1;
         }
     }

-    cvec.layer_start = il_start;
-    cvec.layer_end   = il_end;
+    layer_start = il_start;
+    layer_end   = il_end;

     for (size_t il = 1; il < hparams.n_layer; il++) {
-        assert(cvec.tensors[il] != nullptr);
+        assert(tensors[il] != nullptr);

         const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
         if (off + n_embd <= len) {
-            ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
+            ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
         }
     }

@@ -135,7 +135,7 @@ int32_t llama_control_vector_apply(

 // lora

-llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
+llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
     const std::string name(w->name);

     const auto pos = ab_map.find(name);
@@ -146,11 +146,7 @@ llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
     return nullptr;
 }

-void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
-    delete adapter;
-}
-
-static void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
+static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

     ggml_context * ctx_init;
@@ -222,7 +218,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     };

     // bundle lora_a and lora_b into pairs
-    std::map<std::string, llama_lora_weight> ab_map;
+    std::map<std::string, llama_adapter_lora_weight> ab_map;
     auto str_endswith = [](const std::string & str, const std::string & suffix) {
         return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
     };
@@ -232,14 +228,14 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
         if (str_endswith(name, ".lora_a")) {
             replace_all(name, ".lora_a", "");
             if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = llama_lora_weight(cur, nullptr);
+                ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
             } else {
                 ab_map[name].a = cur;
             }
         } else if (str_endswith(name, ".lora_b")) {
             replace_all(name, ".lora_b", "");
             if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = llama_lora_weight(nullptr, cur);
+                ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
             } else {
                 ab_map[name].b = cur;
             }
@@ -255,7 +251,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     // add tensors
     for (auto & it : ab_map) {
         const std::string & name = it.first;
-        llama_lora_weight & w = it.second;
+        llama_adapter_lora_weight & w = it.second;
         bool is_token_embd = str_endswith(name, "token_embd.weight");

         if (!w.a || !w.b) {
@@ -289,7 +285,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
         struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
         ggml_set_name(tensor_a, w.a->name);
         ggml_set_name(tensor_b, w.b->name);
-        adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
+        adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
     }

     // allocate tensors / buffers and zero
@@ -331,11 +327,11 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }

-struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
-    struct llama_lora_adapter * adapter = new llama_lora_adapter();
+struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
+    struct llama_adapter_lora * adapter = new llama_adapter_lora();

     try {
-        llama_lora_adapter_init_impl(*model, path_lora, *adapter);
+        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
         return adapter;
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
@@ -345,3 +341,7 @@ struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model,

     return nullptr;
 }
+
+void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
+    delete adapter;
+}
diff --git a/src/llama-adapter.h b/src/llama-adapter.h
index 3448656b1..603fa08f6 100644
--- a/src/llama-adapter.h
+++ b/src/llama-adapter.h
@@ -1,73 +1,74 @@
 #pragma once

-#include "llama-impl.h"
-#include "llama-hparams.h"
+#include "llama.h"

 #include "ggml-cpp.h"

+#include <string>
 #include <unordered_map>
 #include <vector>

+// TODO: pimpl
+
 //
 // llama_adapter_cvec
 //

-// TODO: rename to llama_adapter_cvec
-struct llama_control_vector {
-    std::vector<ggml_context_ptr> ctxs;
-    std::vector<ggml_backend_buffer_ptr> bufs;
+struct llama_adapter_cvec {
+    struct ggml_tensor * tensor_for(int il) const;

-    std::vector<struct ggml_tensor *> tensors; // per layer
+    struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const;
+
+    int32_t apply(
+            const llama_model & model,
+            const float * data,
+            size_t len,
+            int32_t n_embd,
+            int32_t il_start,
+            int32_t il_end);
+
+private:
+    bool init(const llama_model & model);

     int32_t layer_start = -1;
     int32_t layer_end   = -1;

-    struct ggml_tensor * tensor_for(int il) const;
+    std::vector<ggml_context_ptr> ctxs;
+    std::vector<ggml_backend_buffer_ptr> bufs;

-    struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const;
+    std::vector<struct ggml_tensor *> tensors; // per layer
 };

-int32_t llama_control_vector_apply(
-        struct llama_control_vector & cvec,
-        const llama_model & model,
-        const float * data,
-        size_t len,
-        int32_t n_embd,
-        int32_t il_start,
-        int32_t il_end);
-
 //
 // llama_adapter_lora
 //

-// TODO: rename to llama_adapter_lora_weight
-struct llama_lora_weight {
+struct llama_adapter_lora_weight {
     struct ggml_tensor * a = nullptr;
     struct ggml_tensor * b = nullptr;

     // get actual scale based on rank and alpha
-    float get_scale(float alpha, float adapter_scale) {
+    float get_scale(float alpha, float adapter_scale) const {
         const float rank  = (float) b->ne[0];
         const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
         return scale;
     }

-    llama_lora_weight() = default;
-    llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
+    llama_adapter_lora_weight() = default;
+    llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
 };

-// TODO: rename to llama_adapter_lora
-struct llama_lora_adapter {
+struct llama_adapter_lora {
     // map tensor name to lora_a_b
-    std::unordered_map<std::string, llama_lora_weight> ab_map;
+    std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;

     std::vector<ggml_context_ptr> ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;

     float alpha;

-    llama_lora_adapter() = default;
-    ~llama_lora_adapter() = default;
+    llama_adapter_lora() = default;
+    ~llama_adapter_lora() = default;

-    llama_lora_weight * get_weight(struct ggml_tensor * w);
+    llama_adapter_lora_weight * get_weight(struct ggml_tensor * w);
 };
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index e20482516..3cb06ee40 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1,5 +1,6 @@
 #include "llama-context.h"

+#include "llama-impl.h"
 #include "llama-mmap.h"

 #include <cassert>
diff --git a/src/llama-context.h b/src/llama-context.h
index 0d163c470..a9268b292 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -22,12 +22,12 @@ struct llama_context {

    const struct llama_model & model;

-    struct llama_cparams        cparams;
-    struct llama_sbatch         sbatch;  // TODO: revisit if needed
-    struct llama_kv_cache       kv_self;
-    struct llama_control_vector cvec;
+    struct llama_cparams      cparams;
+    struct llama_sbatch       sbatch;  // TODO: revisit if needed
+    struct llama_kv_cache     kv_self;
+    struct llama_adapter_cvec cvec;

-    std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
+    std::unordered_map<struct llama_adapter_lora *, float> lora;

     std::vector<ggml_backend_ptr> backends;
     std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
diff --git a/src/llama.cpp b/src/llama.cpp
index 76506abc1..a1c62eb31 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -125,16 +125,16 @@ static struct ggml_tensor * llm_build_inp_embd(
         inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);

         // apply lora for embedding tokens if needed
-        for (auto & it : lctx.lora_adapters) {
-            struct llama_lora_weight * lora = it.first->get_weight(tok_embd);
-            if (lora == nullptr) {
+        for (auto & it : lctx.lora) {
+            struct llama_adapter_lora_weight * lw = it.first->get_weight(tok_embd);
+            if (lw == nullptr) {
                 continue;
             }
             const float adapter_scale = it.second;
-            const float scale = lora->get_scale(it.first->alpha, adapter_scale);
+            const float scale = lw->get_scale(it.first->alpha, adapter_scale);
             struct ggml_tensor * inpL_delta = ggml_scale(ctx, ggml_mul_mat(
-                ctx, lora->b, // non-transposed lora_b
-                ggml_get_rows(ctx, lora->a, lctx.inp_tokens)
+                ctx, lw->b, // non-transposed lora_b
+                ggml_get_rows(ctx, lw->a, lctx.inp_tokens)
             ), scale);
             inpL = ggml_add(ctx, inpL, inpL_delta);
         }
@@ -205,16 +205,16 @@ static struct ggml_tensor * llm_build_lora_mm(
         struct ggml_tensor * w,
         struct ggml_tensor * cur) {
     struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
-    for (auto & it : lctx.lora_adapters) {
-        struct llama_lora_weight * lora = it.first->get_weight(w);
-        if (lora == nullptr) {
+    for (auto & it : lctx.lora) {
+        struct llama_adapter_lora_weight * lw = it.first->get_weight(w);
+        if (lw == nullptr) {
             continue;
         }
         const float adapter_scale = it.second;
-        const float scale = lora->get_scale(it.first->alpha, adapter_scale);
+        const float scale = lw->get_scale(it.first->alpha, adapter_scale);
         struct ggml_tensor * ab_cur = ggml_mul_mat(
-            ctx0, lora->b,
-            ggml_mul_mat(ctx0, lora->a, cur)
+            ctx0, lw->b,
+            ggml_mul_mat(ctx0, lw->a, cur)
         );
         ab_cur = ggml_scale(ctx0, ab_cur, scale);
         res = ggml_add(ctx0, res, ab_cur);
@@ -230,17 +230,17 @@ static struct ggml_tensor * llm_build_lora_mm_id(
         struct ggml_tensor * cur, // struct ggml_tensor * b
         struct ggml_tensor * ids) {
     struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids);
-    for (auto & it : lctx.lora_adapters) {
-        struct llama_lora_weight * lora = it.first->get_weight(w);
-        if (lora == nullptr) {
+    for (auto & it : lctx.lora) {
+        struct llama_adapter_lora_weight * lw = it.first->get_weight(w);
+        if (lw == nullptr) {
             continue;
         }
         const float alpha = it.first->alpha;
-        const float rank  = (float) lora->b->ne[0];
+        const float rank  = (float) lw->b->ne[0];
         const float scale = alpha ? it.second * alpha / rank : it.second;
         struct ggml_tensor * ab_cur = ggml_mul_mat_id(
-            ctx0, lora->b,
-            ggml_mul_mat_id(ctx0, lora->a, cur, ids),
+            ctx0, lw->b,
+            ggml_mul_mat_id(ctx0, lw->a, cur, ids),
             ids
         );
         ab_cur = ggml_scale(ctx0, ab_cur, scale);
@@ -9243,39 +9243,38 @@ static void llama_kv_cache_update_impl(struct llama_context & lctx) {
     }
 }

-int32_t llama_lora_adapter_set(
+int32_t llama_set_adapter_lora(
         struct llama_context * ctx,
-        struct llama_lora_adapter * adapter,
+        struct llama_adapter_lora * adapter,
         float scale) {
-    ctx->lora_adapters[adapter] = scale;
+    ctx->lora[adapter] = scale;
     return 0;
 }

-int32_t llama_lora_adapter_remove(
+int32_t llama_rm_adapter_lora(
         struct llama_context * ctx,
-        struct llama_lora_adapter * adapter) {
-    auto pos = ctx->lora_adapters.find(adapter);
-    if (pos != ctx->lora_adapters.end()) {
-        ctx->lora_adapters.erase(pos);
+        struct llama_adapter_lora * adapter) {
+    auto pos = ctx->lora.find(adapter);
+    if (pos != ctx->lora.end()) {
+        ctx->lora.erase(pos);
         return 0;
     }

     return -1;
 }

-void llama_lora_adapter_clear(struct llama_context * ctx) {
-    ctx->lora_adapters.clear();
+void llama_clear_adapter_lora(struct llama_context * ctx) {
+    ctx->lora.clear();
 }

-// TODO: tmp
-int32_t llama_control_vector_apply(
-        struct llama_context * lctx,
+int32_t llama_apply_adapter_cvec(
+        struct llama_context * ctx,
         const float * data,
         size_t len,
         int32_t n_embd,
         int32_t il_start,
         int32_t il_end) {
-    return llama_control_vector_apply(lctx->cvec, lctx->model, data, len, n_embd, il_start, il_end);
+    return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end);
 }

 //
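Migration note: only the adapter symbol names change in this patch; loading, attaching and freeing behave as before. Below is a minimal usage sketch of the renamed LoRA API (not part of the patch itself) — the adapter path and the 1.0f scale are placeholders, and it assumes `model` and `ctx` have already been created:

```cpp
#include "llama.h"

// Load a LoRA adapter and attach it to a context using the renamed API.
// Returns true on success; the adapter path below is a placeholder.
static bool attach_lora(llama_model * model, llama_context * ctx) {
    llama_adapter_lora * adapter = llama_adapter_lora_init(model, "my-adapter.gguf");
    if (adapter == nullptr) {
        return false; // failed to load the adapter file
    }

    // Attach to this context with a user-defined scale; model weights are not modified.
    if (llama_set_adapter_lora(ctx, adapter, 1.0f) != 0) {
        llama_adapter_lora_free(adapter);
        return false;
    }

    // Later, either detach from this context only:
    //     llama_rm_adapter_lora(ctx, adapter);   // returns -1 if not present
    // or drop all adapters attached to the context:
    //     llama_clear_adapter_lora(ctx);
    // The adapter can be freed manually with llama_adapter_lora_free(),
    // or it is freed automatically when the associated model is deleted.
    return true;
}
```

Control-vector users make the analogous one-line change: llama_control_vector_apply(lctx, ...) becomes llama_apply_adapter_cvec(ctx, ...) with the same arguments.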