Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2025-01-12 03:31:46 +00:00)

parent aeeb9420a3
commit d1af0e9b75
@@ -910,7 +910,8 @@ struct common_init_result common_init_from_params(common_params & params) {
             return iparams;
         }
 
-        int err = llama_control_vector_apply(lctx,
+        int err = llama_apply_adapter_cvec(
+                lctx,
                 cvec.data.data(),
                 cvec.data.size(),
                 cvec.n_embd,
@@ -926,8 +927,8 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-        llama_lora_adapter_ptr lora;
-        lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+        llama_adapter_lora_ptr lora;
+        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
         if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
@@ -940,7 +941,7 @@ struct common_init_result common_init_from_params(common_params & params) {
     }
 
     if (!params.lora_init_without_apply) {
-        common_lora_adapters_apply(lctx, params.lora_adapters);
+        common_set_adapter_lora(lctx, params.lora_adapters);
     }
 
     if (params.sampling.ignore_eos && llama_token_eos(vocab) == LLAMA_TOKEN_NULL) {
@@ -1008,11 +1009,11 @@ struct common_init_result common_init_from_params(common_params & params) {
     return iparams;
 }
 
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
-    llama_lora_adapter_clear(ctx);
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
+    llama_clear_adapter_lora(ctx);
     for (auto & la : lora) {
         if (la.scale != 0.0f) {
-            llama_lora_adapter_set(ctx, la.ptr, la.scale);
+            llama_set_adapter_lora(ctx, la.ptr, la.scale);
         }
     }
 }
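For orientation, a minimal caller-side sketch of the renamed common helper (not part of the commit; the adapter path and scale are illustrative, and error handling is omitted):

    #include "common.h" // common_adapter_lora_info, common_set_adapter_lora
    #include "llama.h"  // llama_adapter_lora_init

    #include <vector>

    // assumes `model` and `ctx` were created elsewhere
    static void apply_single_lora(llama_model * model, llama_context * ctx) {
        common_adapter_lora_info la;
        la.path  = "my-adapter.gguf";                               // hypothetical file name
        la.scale = 0.5f;                                            // user-defined scale
        la.ptr   = llama_adapter_lora_init(model, la.path.c_str()); // load the adapter from disk

        std::vector<common_adapter_lora_info> loras = { la };
        common_set_adapter_lora(ctx, loras); // clears any existing adapters, then applies this list
    }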
@@ -24,11 +24,11 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
-struct common_lora_adapter_info {
+struct common_adapter_lora_info {
     std::string path;
     float scale;
 
-    struct llama_lora_adapter * ptr;
+    struct llama_adapter_lora * ptr;
 };
 
 using llama_tokens = std::vector<llama_token>;
@@ -246,8 +246,8 @@ struct common_params {
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
 
-    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
-    std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
+    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
 
     std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
 
@@ -481,7 +481,7 @@ struct common_init_result {
     llama_model_ptr model;
     llama_context_ptr context;
 
-    std::vector<llama_lora_adapter_ptr> lora;
+    std::vector<llama_adapter_lora_ptr> lora;
 };
 
 struct common_init_result common_init_from_params(common_params & params);
@@ -503,7 +503,7 @@ struct llama_model * common_load_model_from_hf(
         const struct llama_model_params & params);
 
 // clear LoRA adapters from context, then apply new list of adapters
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
 
 //
 // Batch utils
@@ -130,7 +130,7 @@ struct lora_merge_ctx {
 
     lora_merge_ctx(
             std::string & base_fname,
-            std::vector<common_lora_adapter_info> & lora_files,
+            std::vector<common_adapter_lora_info> & lora_files,
             std::string & outfile,
             int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
         fout.exceptions(std::ofstream::failbit); // fail fast on write errors
@@ -98,7 +98,7 @@ struct slot_params {
     int64_t t_max_prompt_ms = -1; // TODO: implement
     int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
 
-    std::vector<common_lora_adapter_info> lora;
+    std::vector<common_adapter_lora_info> lora;
 
     std::vector<std::string> antiprompt;
     std::vector<std::string> response_fields;
@@ -198,7 +198,7 @@ struct server_task {
     bool metrics_reset_bucket = false;
 
     // used by SERVER_TASK_TYPE_SET_LORA
-    std::vector<common_lora_adapter_info> set_lora;
+    std::vector<common_adapter_lora_info> set_lora;
 
     server_task(server_task_type type) : type(type) {}
 
@@ -1133,7 +1133,7 @@ struct server_slot {
 
     common_speculative * spec = nullptr;
 
-    std::vector<common_lora_adapter_info> lora;
+    std::vector<common_adapter_lora_info> lora;
 
     // the index relative to completion multi-task request
     size_t index = 0;
@@ -2934,7 +2934,7 @@ struct server_context {
             // make sure we're in the right embedding mode
             llama_set_embeddings(ctx, slot_batched->is_non_causal());
             // apply lora, only need to do it once per batch
-            common_lora_adapters_apply(ctx, slot_batched->lora);
+            common_set_adapter_lora(ctx, slot_batched->lora);
         }
 
         // process the created batch of tokens
@@ -804,8 +804,8 @@ static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx
 }
 
 static bool are_lora_equal(
-        const std::vector<common_lora_adapter_info> & l1,
-        const std::vector<common_lora_adapter_info> & l2) {
+        const std::vector<common_adapter_lora_info> & l1,
+        const std::vector<common_adapter_lora_info> & l2) {
     if (l1.size() != l2.size()) {
         return false;
     }
@@ -819,10 +819,10 @@ static bool are_lora_equal(
 }
 
 // parse lora config from JSON request, returned a copy of lora_base with updated scale
-static std::vector<common_lora_adapter_info> parse_lora_request(
-        const std::vector<common_lora_adapter_info> & lora_base,
+static std::vector<common_adapter_lora_info> parse_lora_request(
+        const std::vector<common_adapter_lora_info> & lora_base,
         const json & data) {
-    std::vector<common_lora_adapter_info> lora(lora_base);
+    std::vector<common_adapter_lora_info> lora(lora_base);
     int max_idx = lora.size();
 
     // clear existing value
@@ -20,11 +20,11 @@ struct llama_sampler_deleter {
     void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
 };
 
-struct llama_lora_adapter_deleter {
-    void operator()(llama_lora_adapter * lora_adapter) { llama_lora_adapter_free(lora_adapter); }
+struct llama_adapter_lora_deleter {
+    void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
 };
 
 typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
 typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
 typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
-typedef std::unique_ptr<llama_lora_adapter, llama_lora_adapter_deleter> llama_lora_adapter_ptr;
+typedef std::unique_ptr<llama_adapter_lora, llama_adapter_lora_deleter> llama_adapter_lora_ptr;
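The renamed smart-pointer alias keeps the same RAII behavior as before; a small sketch (the adapter path is illustrative):

    #include "llama-cpp.h"

    static void load_lora_scoped(llama_model * model) {
        // the deleter defined above calls llama_adapter_lora_free() when the pointer goes out of scope
        llama_adapter_lora_ptr lora(llama_adapter_lora_init(model, "my-adapter.gguf"));
        if (!lora) {
            return; // loading failed
        }
        // ... use lora.get() with llama_set_adapter_lora(ctx, lora.get(), scale) ...
    }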
@@ -385,8 +385,7 @@ extern "C" {
     } llama_chat_message;
 
     // lora adapter
-    // TODO: rename to llama_adapter_lora
-    struct llama_lora_adapter;
+    struct llama_adapter_lora;
 
     // Helpers for getting default parameters
     // TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)
@@ -520,34 +519,31 @@ extern "C" {
     //
 
     // Load a LoRA adapter from file
-    // TODO: rename to llama_adapter_lora_init
-    LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
+    LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
             struct llama_model * model,
             const char * path_lora);
 
+    // Manually free a LoRA adapter
+    // Note: loaded adapters will be free when the associated model is deleted
+    LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
+
+    // The following functions operate on a llama_context, hence the naming: llama_verb_...
+
     // Add a loaded LoRA adapter to given context
     // This will not modify model's weight
-    // TODO: rename to llama_set_adapter_lora
-    LLAMA_API int32_t llama_lora_adapter_set(
+    LLAMA_API int32_t llama_set_adapter_lora(
             struct llama_context * ctx,
-            struct llama_lora_adapter * adapter,
+            struct llama_adapter_lora * adapter,
             float scale);
 
     // Remove a specific LoRA adapter from given context
     // Return -1 if the adapter is not present in the context
-    // TODO: rename to llama_rm_adapter_lora
-    LLAMA_API int32_t llama_lora_adapter_remove(
+    LLAMA_API int32_t llama_rm_adapter_lora(
             struct llama_context * ctx,
-            struct llama_lora_adapter * adapter);
+            struct llama_adapter_lora * adapter);
 
     // Remove all LoRA adapters from given context
-    // TODO: rename to llama_clear_adapter_lora
-    LLAMA_API void llama_lora_adapter_clear(struct llama_context * ctx);
-
-    // Manually free a LoRA adapter
-    // Note: loaded adapters will be free when the associated model is deleted
-    // TODO: rename to llama_adapter_lora_free
-    LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
+    LLAMA_API void llama_clear_adapter_lora(struct llama_context * ctx);
 
     // Apply a loaded control vector to a llama_context, or if data is NULL, clear
     // the currently loaded vector.
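Taken together, the renamed context-level calls declared above are used roughly as follows (a sketch, assuming `model` and `ctx` already exist; the path and scale are illustrative):

    #include "llama.h"

    static void demo_adapter_lora_api(llama_model * model, llama_context * ctx) {
        llama_adapter_lora * adapter = llama_adapter_lora_init(model, "my-adapter.gguf");
        if (adapter == nullptr) {
            return;
        }

        llama_set_adapter_lora(ctx, adapter, 1.0f); // attach to the context with scale 1.0
        llama_rm_adapter_lora(ctx, adapter);        // detach just this adapter (returns -1 if not attached)
        llama_clear_adapter_lora(ctx);              // detach all adapters from the context

        llama_adapter_lora_free(adapter);           // optional: adapters are also freed with the model
    }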
@@ -555,9 +551,8 @@ extern "C" {
     // to an n_embd x n_layers buffer starting from layer 1.
     // il_start and il_end are the layer range the vector should apply to (both inclusive)
     // See llama_control_vector_load in common to load a control vector.
-    // TODO: rename to llama_adapter_cvec_apply
-    LLAMA_API int32_t llama_control_vector_apply(
-            struct llama_context * lctx,
+    LLAMA_API int32_t llama_apply_adapter_cvec(
+            struct llama_context * ctx,
             const float * data,
             size_t len,
             int32_t n_embd,
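A corresponding sketch for the renamed control-vector entry point; per the comment above, `data` is an n_embd x n_layers buffer starting from layer 1 (the zero-filled buffer and layer range here are illustrative):

    #include "llama.h"

    #include <vector>

    static void demo_apply_cvec(llama_context * ctx, int32_t n_embd, int32_t n_layers) {
        // zero-filled direction vectors for layers 1..n_layers (a no-op control vector)
        std::vector<float> data((size_t) n_embd * n_layers, 0.0f);
        llama_apply_adapter_cvec(ctx, data.data(), data.size(), n_embd, 1, n_layers);

        // passing NULL for data clears the currently loaded control vector
        llama_apply_adapter_cvec(ctx, nullptr, 0, n_embd, 0, 0);
    }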
@@ -1,5 +1,6 @@
 #include "llama-adapter.h"
 
+#include "llama-impl.h"
 #include "llama-mmap.h"
 #include "llama-model.h"
 
@@ -10,7 +11,7 @@
 
 // vec
 
-struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
+struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
     if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
         return nullptr;
     }
@@ -18,7 +19,7 @@ struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
     return tensors[il];
 }
 
-struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
+struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
     ggml_tensor * layer_dir = tensor_for(il);
     if (layer_dir != nullptr) {
         cur = ggml_add(ctx, cur, layer_dir);
@@ -27,12 +28,12 @@ struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, s
     return cur;
 }
 
-static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
+bool llama_adapter_cvec::init(const llama_model & model) {
     const auto & hparams = model.hparams;
 
-    GGML_ASSERT(cvec.tensors.empty());
-    GGML_ASSERT(cvec.ctxs.empty());
-    GGML_ASSERT(cvec.bufs.empty());
+    GGML_ASSERT(tensors.empty());
+    GGML_ASSERT(ctxs.empty());
+    GGML_ASSERT(bufs.empty());
 
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -51,7 +52,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
             }
 
             ctx_map[buft] = ctx;
-            cvec.ctxs.emplace_back(ctx);
+            ctxs.emplace_back(ctx);
 
             return ctx;
         }
@@ -60,8 +61,8 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     };
 
     // make tensors
-    cvec.tensors.reserve(hparams.n_layer);
-    cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
+    tensors.reserve(hparams.n_layer);
+    tensors.push_back(nullptr); // there's never a tensor for layer 0
     for (size_t il = 1; il < hparams.n_layer; il++) {
         ggml_backend_buffer_type_t buft = model.select_buft(il);
         ggml_context * ctx = ctx_for_buft(buft);
@@ -70,11 +71,11 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
             return false;
         }
         ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
-        cvec.tensors.push_back(tensor);
+        tensors.push_back(tensor);
     }
 
     // allocate tensors / buffers and zero
-    cvec.bufs.reserve(ctx_map.size());
+    bufs.reserve(ctx_map.size());
     for (auto it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
@@ -84,14 +85,13 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
             return false;
         }
         ggml_backend_buffer_clear(buf, 0);
-        cvec.bufs.emplace_back(buf);
+        bufs.emplace_back(buf);
     }
 
     return true;
 }
 
-int32_t llama_control_vector_apply(
-        struct llama_control_vector & cvec,
+int32_t llama_adapter_cvec::apply(
         const llama_model & model,
         const float * data,
         size_t len,
@@ -102,8 +102,8 @@ int32_t llama_control_vector_apply(
 
     if (data == nullptr) {
         // disable the current control vector (but leave allocated for later)
-        cvec.layer_start = -1;
-        cvec.layer_end = -1;
+        layer_start = -1;
+        layer_end = -1;
         return 0;
     }
 
@@ -112,21 +112,21 @@ int32_t llama_control_vector_apply(
         return 1;
     }
 
-    if (cvec.tensors.empty()) {
-        if (!llama_control_vector_init(cvec, model)) {
+    if (tensors.empty()) {
+        if (!init(model)) {
             return 1;
         }
     }
 
-    cvec.layer_start = il_start;
-    cvec.layer_end = il_end;
+    layer_start = il_start;
+    layer_end = il_end;
 
     for (size_t il = 1; il < hparams.n_layer; il++) {
-        assert(cvec.tensors[il] != nullptr);
+        assert(tensors[il] != nullptr);
 
         const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
         if (off + n_embd <= len) {
-            ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
+            ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
         }
     }
 
@@ -135,7 +135,7 @@ int32_t llama_control_vector_apply(
 
 // lora
 
-llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
+llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
     const std::string name(w->name);
 
     const auto pos = ab_map.find(name);
@@ -146,11 +146,7 @@ llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
     return nullptr;
 }
 
-void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
-    delete adapter;
-}
-
-static void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
+static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
     LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
 
     ggml_context * ctx_init;
@@ -222,7 +218,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     };
 
     // bundle lora_a and lora_b into pairs
-    std::map<std::string, llama_lora_weight> ab_map;
+    std::map<std::string, llama_adapter_lora_weight> ab_map;
     auto str_endswith = [](const std::string & str, const std::string & suffix) {
         return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
     };
@@ -232,14 +228,14 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
         if (str_endswith(name, ".lora_a")) {
             replace_all(name, ".lora_a", "");
             if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = llama_lora_weight(cur, nullptr);
+                ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
             } else {
                 ab_map[name].a = cur;
             }
         } else if (str_endswith(name, ".lora_b")) {
             replace_all(name, ".lora_b", "");
             if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = llama_lora_weight(nullptr, cur);
+                ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
             } else {
                 ab_map[name].b = cur;
             }
@@ -255,7 +251,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     // add tensors
     for (auto & it : ab_map) {
         const std::string & name = it.first;
-        llama_lora_weight & w = it.second;
+        llama_adapter_lora_weight & w = it.second;
         bool is_token_embd = str_endswith(name, "token_embd.weight");
 
         if (!w.a || !w.b) {
@@ -289,7 +285,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
         struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
         ggml_set_name(tensor_a, w.a->name);
         ggml_set_name(tensor_b, w.b->name);
-        adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
+        adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
     }
 
     // allocate tensors / buffers and zero
@@ -331,11 +327,11 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }
 
-struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
-    struct llama_lora_adapter * adapter = new llama_lora_adapter();
+struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
+    struct llama_adapter_lora * adapter = new llama_adapter_lora();
 
     try {
-        llama_lora_adapter_init_impl(*model, path_lora, *adapter);
+        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
         return adapter;
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
@@ -345,3 +341,7 @@ struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model,
 
     return nullptr;
 }
+
+void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
+    delete adapter;
+}
@@ -1,34 +1,25 @@
 #pragma once
 
-#include "llama-impl.h"
-#include "llama-hparams.h"
+#include "llama.h"
 
 #include "ggml-cpp.h"
 
+#include <string>
 #include <unordered_map>
 #include <vector>
 
+// TODO: pimpl
+
 //
 // llama_adapter_cvec
 //
 
-// TODO: rename to llama_adapter_cvec
-struct llama_control_vector {
-    std::vector<ggml_context_ptr> ctxs;
-    std::vector<ggml_backend_buffer_ptr> bufs;
-
-    std::vector<struct ggml_tensor *> tensors; // per layer
-
-    int32_t layer_start = -1;
-    int32_t layer_end = -1;
-
+struct llama_adapter_cvec {
     struct ggml_tensor * tensor_for(int il) const;
 
     struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const;
-};
 
-int32_t llama_control_vector_apply(
-        struct llama_control_vector & cvec,
+    int32_t apply(
         const llama_model & model,
         const float * data,
         size_t len,
@@ -36,38 +27,48 @@ int32_t llama_control_vector_apply(
         int32_t il_start,
         int32_t il_end);
 
+private:
+    bool init(const llama_model & model);
+
+    int32_t layer_start = -1;
+    int32_t layer_end = -1;
+
+    std::vector<ggml_context_ptr> ctxs;
+    std::vector<ggml_backend_buffer_ptr> bufs;
+
+    std::vector<struct ggml_tensor *> tensors; // per layer
+};
+
 //
 // llama_adapter_lora
 //
 
-// TODO: rename to llama_adapter_lora_weight
-struct llama_lora_weight {
+struct llama_adapter_lora_weight {
     struct ggml_tensor * a = nullptr;
     struct ggml_tensor * b = nullptr;
 
     // get actual scale based on rank and alpha
-    float get_scale(float alpha, float adapter_scale) {
+    float get_scale(float alpha, float adapter_scale) const {
         const float rank = (float) b->ne[0];
         const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
         return scale;
     }
 
-    llama_lora_weight() = default;
-    llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
+    llama_adapter_lora_weight() = default;
+    llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
 };
 
-// TODO: rename to llama_adapter_lora
-struct llama_lora_adapter {
+struct llama_adapter_lora {
     // map tensor name to lora_a_b
-    std::unordered_map<std::string, struct llama_lora_weight> ab_map;
+    std::unordered_map<std::string, struct llama_adapter_lora_weight> ab_map;
 
     std::vector<ggml_context_ptr> ctxs;
     std::vector<ggml_backend_buffer_ptr> bufs;
 
     float alpha;
 
-    llama_lora_adapter() = default;
-    ~llama_lora_adapter() = default;
+    llama_adapter_lora() = default;
+    ~llama_adapter_lora() = default;
 
-    llama_lora_weight * get_weight(struct ggml_tensor * w);
+    llama_adapter_lora_weight * get_weight(struct ggml_tensor * w);
 };
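For reference, the effective per-adapter scale computed by get_scale() above is adapter_scale * alpha / rank when alpha is non-zero (rank is read from the first dimension of the lora_b tensor), and plain adapter_scale otherwise; e.g. alpha = 16, rank = 8 and a user scale of 1.0 give an effective scale of 2.0 (illustrative numbers, not taken from the commit).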
@@ -1,5 +1,6 @@
 #include "llama-context.h"
 
+#include "llama-impl.h"
 #include "llama-mmap.h"
 
 #include <cassert>
@@ -25,9 +25,9 @@ struct llama_context {
     struct llama_cparams cparams;
     struct llama_sbatch sbatch; // TODO: revisit if needed
     struct llama_kv_cache kv_self;
-    struct llama_control_vector cvec;
+    struct llama_adapter_cvec cvec;
 
-    std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
+    std::unordered_map<struct llama_adapter_lora *, float> lora;
 
     std::vector<ggml_backend_ptr> backends;
     std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
@@ -125,16 +125,16 @@ static struct ggml_tensor * llm_build_inp_embd(
         inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
 
         // apply lora for embedding tokens if needed
-        for (auto & it : lctx.lora_adapters) {
-            struct llama_lora_weight * lora = it.first->get_weight(tok_embd);
-            if (lora == nullptr) {
+        for (auto & it : lctx.lora) {
+            struct llama_adapter_lora_weight * lw = it.first->get_weight(tok_embd);
+            if (lw == nullptr) {
                 continue;
             }
             const float adapter_scale = it.second;
-            const float scale = lora->get_scale(it.first->alpha, adapter_scale);
+            const float scale = lw->get_scale(it.first->alpha, adapter_scale);
             struct ggml_tensor * inpL_delta = ggml_scale(ctx, ggml_mul_mat(
-                ctx, lora->b, // non-transposed lora_b
-                ggml_get_rows(ctx, lora->a, lctx.inp_tokens)
+                ctx, lw->b, // non-transposed lora_b
+                ggml_get_rows(ctx, lw->a, lctx.inp_tokens)
             ), scale);
             inpL = ggml_add(ctx, inpL, inpL_delta);
         }
@@ -205,16 +205,16 @@ static struct ggml_tensor * llm_build_lora_mm(
         struct ggml_tensor * w,
         struct ggml_tensor * cur) {
     struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
-    for (auto & it : lctx.lora_adapters) {
-        struct llama_lora_weight * lora = it.first->get_weight(w);
-        if (lora == nullptr) {
+    for (auto & it : lctx.lora) {
+        struct llama_adapter_lora_weight * lw = it.first->get_weight(w);
+        if (lw == nullptr) {
             continue;
         }
        const float adapter_scale = it.second;
-        const float scale = lora->get_scale(it.first->alpha, adapter_scale);
+        const float scale = lw->get_scale(it.first->alpha, adapter_scale);
         struct ggml_tensor * ab_cur = ggml_mul_mat(
-            ctx0, lora->b,
-            ggml_mul_mat(ctx0, lora->a, cur)
+            ctx0, lw->b,
+            ggml_mul_mat(ctx0, lw->a, cur)
         );
         ab_cur = ggml_scale(ctx0, ab_cur, scale);
         res = ggml_add(ctx0, res, ab_cur);
@@ -230,17 +230,17 @@ static struct ggml_tensor * llm_build_lora_mm_id(
         struct ggml_tensor * cur, // struct ggml_tensor * b
         struct ggml_tensor * ids) {
     struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids);
-    for (auto & it : lctx.lora_adapters) {
-        struct llama_lora_weight * lora = it.first->get_weight(w);
-        if (lora == nullptr) {
+    for (auto & it : lctx.lora) {
+        struct llama_adapter_lora_weight * lw = it.first->get_weight(w);
+        if (lw == nullptr) {
             continue;
         }
         const float alpha = it.first->alpha;
-        const float rank = (float) lora->b->ne[0];
+        const float rank = (float) lw->b->ne[0];
         const float scale = alpha ? it.second * alpha / rank : it.second;
         struct ggml_tensor * ab_cur = ggml_mul_mat_id(
-            ctx0, lora->b,
-            ggml_mul_mat_id(ctx0, lora->a, cur, ids),
+            ctx0, lw->b,
+            ggml_mul_mat_id(ctx0, lw->a, cur, ids),
             ids
         );
         ab_cur = ggml_scale(ctx0, ab_cur, scale);
@@ -9243,39 +9243,38 @@ static void llama_kv_cache_update_impl(struct llama_context & lctx) {
     }
 }
 
-int32_t llama_lora_adapter_set(
+int32_t llama_set_adapter_lora(
             struct llama_context * ctx,
-            struct llama_lora_adapter * adapter,
+            struct llama_adapter_lora * adapter,
             float scale) {
-    ctx->lora_adapters[adapter] = scale;
+    ctx->lora[adapter] = scale;
     return 0;
 }
 
-int32_t llama_lora_adapter_remove(
+int32_t llama_rm_adapter_lora(
             struct llama_context * ctx,
-            struct llama_lora_adapter * adapter) {
-    auto pos = ctx->lora_adapters.find(adapter);
-    if (pos != ctx->lora_adapters.end()) {
-        ctx->lora_adapters.erase(pos);
+            struct llama_adapter_lora * adapter) {
+    auto pos = ctx->lora.find(adapter);
+    if (pos != ctx->lora.end()) {
+        ctx->lora.erase(pos);
         return 0;
     }
 
     return -1;
 }
 
-void llama_lora_adapter_clear(struct llama_context * ctx) {
-    ctx->lora_adapters.clear();
+void llama_clear_adapter_lora(struct llama_context * ctx) {
+    ctx->lora.clear();
 }
 
-// TODO: tmp
-int32_t llama_control_vector_apply(
-        struct llama_context * lctx,
+int32_t llama_apply_adapter_cvec(
+        struct llama_context * ctx,
         const float * data,
         size_t len,
         int32_t n_embd,
         int32_t il_start,
         int32_t il_end) {
-    return llama_control_vector_apply(lctx->cvec, lctx->model, data, len, n_embd, il_start, il_end);
+    return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end);
 }
 
 //