From 67c5e14d069fba61a424f6d782de2d49bf2a8722 Mon Sep 17 00:00:00 2001
From: ngxson
Date: Sat, 6 Jul 2024 02:12:53 +0200
Subject: [PATCH] lora: load to device buft

---
 common/common.cpp |  10 +-
 include/llama.h   |  13 +-
 src/llama.cpp     | 423 ++++++++++++++++++----------------------------
 3 files changed, 172 insertions(+), 274 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index c548bcb28..d3eec6aa7 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2063,14 +2063,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
         const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
         float lora_scale = std::get<1>(params.lora_adapter[i]);
-        int err = llama_model_apply_lora_from_file(model,
-                                             lora_adapter.c_str(),
-                                             lora_scale,
-                                             ((i > 0) || params.lora_base.empty())
-                                                ? NULL
-                                                : params.lora_base.c_str(),
-                                             params.n_threads);
-        if (err != 0) {
+        auto adapter = llama_lora_adapter_init(lctx, lora_adapter.c_str());
+        if (adapter == nullptr) {
             fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
             llama_free(lctx);
             llama_free_model(model);
diff --git a/include/llama.h b/include/llama.h
index 865ace994..077d90283 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -406,6 +406,9 @@ extern "C" {
         const char * content;
     } llama_chat_message;
 
+    // lora adapter
+    struct llama_lora_adapter;
+
     // Helpers for getting default parameters
     LLAMA_API struct llama_model_params llama_model_default_params(void);
     LLAMA_API struct llama_context_params llama_context_default_params(void);
@@ -510,13 +513,9 @@ extern "C" {
     // the layers modified by the adapter. Can be NULL to use the current loaded model.
     // The model needs to be reloaded before applying a new adapter, otherwise the adapter
     // will be applied on top of the previous one
-    // Returns 0 on success
-    LLAMA_API int32_t llama_model_apply_lora_from_file(
-            const struct llama_model * model,
-                      const char * path_lora,
-                             float scale,
-                      const char * path_base_model,
-                           int32_t n_threads);
+    LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
+            struct llama_context * ctx,
+                      const char * path_lora);
 
     // Apply a loaded control vector to a llama_context, or if data is NULL, clear
     // the currently loaded vector.
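
Reviewer note: the public API moves from a one-shot, destructive weight merge to an adapter handle owned by the context. A minimal caller-side sketch of the migration (illustrative only, not part of the patch; file names are hypothetical and error handling is elided):

    // before this patch: LoRA was merged into the model weights once, on the CPU:
    //   llama_model_apply_lora_from_file(model, "adapter.ggla", 1.0f, NULL, n_threads);
    // after this patch: the adapter is loaded into per-device buffers and owned by ctx
    struct llama_lora_adapter * adapter = llama_lora_adapter_init(ctx, "adapter.gguf");
    if (adapter == nullptr) {
        fprintf(stderr, "failed to load lora adapter\n");
    }
    // no explicit free here: ~llama_context() deletes every adapter it loaded

Note that the scale and path_base_model parameters are gone: the adapter is no longer merged into (possibly quantized) base weights, so a separate f16/f32 base model is unnecessary, and scaling can move to the point where the adapter is applied at graph-build time.
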
diff --git a/src/llama.cpp b/src/llama.cpp
index b770ca5bc..ec89b2778 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2547,6 +2547,29 @@ struct llama_control_vector {
     }
 };
 
+struct lora_weight {
+    struct ggml_tensor * a = nullptr;
+    struct ggml_tensor * b = nullptr;
+    lora_weight() {}
+    lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {}
+};
+
+struct llama_lora_adapter {
+    // map tensor name to lora_a_b
+    std::map<std::string, lora_weight> ab_map;
+    std::vector<struct ggml_context *> ctxs;
+    std::vector<ggml_backend_buffer_t> bufs;
+
+    ~llama_lora_adapter() {
+        for (struct ggml_context * ctx : ctxs) {
+            ggml_free(ctx);
+        }
+        for (ggml_backend_buffer_t buf : bufs) {
+            ggml_backend_buffer_free(buf);
+        }
+    }
+};
+
 struct llama_vocab {
     using id    = int32_t;
     using token = std::string;
@@ -2704,6 +2727,10 @@ struct llama_context {
         }
 
         ggml_backend_buffer_free(buf_output);
+
+        for (auto adapter : lora_adapters) {
+            delete adapter;
+        }
     }
 
     llama_cparams cparams;
@@ -2795,6 +2822,9 @@ struct llama_context {
 
     // control vectors
     struct llama_control_vector cvec;
+
+    // lora adapters
+    std::vector<struct llama_lora_adapter *> lora_adapters;
 };
 
 static size_t llama_get_device_count(const llama_model & model) {
@@ -18243,281 +18273,149 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 }
 
-static int llama_apply_lora_from_file_internal(
-    const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
-) {
+static int llama_lora_adapter_init_internal(const struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
+    static const int n_inp_tensors = 5; // see llama_model
+    static const int n_out_tensors = 5; // see llama_model
     LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
-    const int64_t t_start_lora_us = ggml_time_us();
+    // TODO: check lora base model arch
 
-    llama_file fin(path_lora, "rb");
-
-    // verify magic and version
-    {
-        uint32_t magic = fin.read_u32();
-        if (magic != LLAMA_FILE_MAGIC_GGLA) {
-            LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
-            return 1;
-        }
-
-        uint32_t format_version = fin.read_u32();
-        if (format_version != 1) {
-            LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
-            return 1;
-        }
-    }
-
-    int32_t lora_r = fin.read_u32();
-    int32_t lora_alpha = fin.read_u32();
-    float scaling = scale * (float)lora_alpha / (float)lora_r;
-
-    LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
-
-    // load base model
-    std::unique_ptr<llama_model_loader> ml;
-    if (path_base_model) {
-        LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
-        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
-        ml->init_mappings(/*prefetch*/ false); // no prefetching
-    }
-
-    struct tensor_meta {
-        std::string name;
-        ggml_type type;
-        int32_t ne[2];
-        size_t offset;
+    ggml_context * ctx = nullptr;
+    struct gguf_init_params meta_gguf_params = {
+        /* .no_alloc = */ false,
+        /* .ctx      = */ &ctx,
     };
-    std::map<std::string, tensor_meta> tensor_meta_map;
-
-    // load all tensor meta
-    while (true) {
-        if (fin.tell() == fin.size) {
-            // eof
-            break;
-        }
-
-        int32_t n_dims;
-        int32_t name_len;
-        int32_t ftype;
-
-        fin.read_raw(&n_dims, sizeof(n_dims));
-        fin.read_raw(&name_len, sizeof(name_len));
-        fin.read_raw(&ftype, sizeof(ftype));
-
-        if (n_dims != 1 && n_dims != 2) {
-            LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
-            return 1;
-        }
-
-        int32_t ne[2] = { 1, 1 };
-        for (int i = 0; i < n_dims; ++i) {
-            fin.read_raw(&ne[i], sizeof(ne[i]));
-        }
-
-        std::string name;
-        {
-            GGML_ASSERT(name_len < GGML_MAX_NAME);
-            char buf[GGML_MAX_NAME];
-            fin.read_raw(buf, name_len);
-            name = std::string(buf, name_len);
-        }
-
-        // check for lora suffix
-        std::string lora_suffix;
-        if (name.length() > 6) {
-            lora_suffix = name.substr(name.length() - 6);
-        }
-        if (lora_suffix != ".loraA" && lora_suffix != ".loraB") {
-            LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
-            return 1;
-        }
-
-        // tensor type
-        ggml_type wtype;
-        switch (ftype) {
-            case 0: wtype = GGML_TYPE_F32;  break;
-            case 1: wtype = GGML_TYPE_F16;  break;
-            default:
-                    {
-                        LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
-                                __func__, ftype);
-                        return 1;
-                    }
-        }
-
-        // data offset
-        size_t offset = fin.tell();
-        offset = (offset + 31) & -32;
-
-        // skip tensor data
-        fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET);
-
-        tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset });
+    struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params);
+    if (!ctx_gguf) {
+        LLAMA_LOG_ERROR("%s: failed to load lora adapter file from %s\n", __func__, path_lora);
+        return -1;
     }
 
-    bool warned = false;
-    int n_tensors = 0;
-
-    // apply
-    ggml_backend_t backend_cpu = ggml_backend_cpu_init();
-    if (backend_cpu == nullptr) {
-        LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__);
-        return 1;
-    }
-    ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
-
-    std::vector<no_init<uint8_t>> read_buf;
-    for (const auto & it : model.tensors_by_name) {
-        const std::string & base_name = it.first;
-        ggml_tensor * model_t = it.second;
-
-        if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() ||
-            tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) {
-            continue;
+    // calculate n_tensors_per_layer
+    int n_tensors_per_layer = 0;
+    {
+        int32_t n_tensors = gguf_get_n_tensors(ctx_gguf);
+        for (int i = 0; i < n_tensors; i++) {
+            int il = -1;
+            sscanf(gguf_get_tensor_name(ctx_gguf, i), "blk.%d.", &il);
+            if (il == 0) n_tensors_per_layer++;
         }
+    }
+    printf("n_tensors_per_layer %d\n", n_tensors_per_layer);
 
-        tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA");
-        tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB");
+    // count layer buffer types
+    std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
+    for (int64_t i = 0; i < model.hparams.n_layer; i++) {
+        buft_layer_count[model.buft_layer[i].buft]++;
+    }
 
-        ggml_init_params lora_init_params = {
-            /* .mem_size   */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
-            /* .mem_buffer */ nullptr,
-            /* .no_alloc   */ true,
+    // allocate contexts
+    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    {
+        auto new_ggml_ctx = [](size_t n_tensors) {
+            struct ggml_init_params params = {
+                /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc   =*/ true,
+            };
+            return ggml_init(params);
         };
-        ggml_context * lora_ctx = ggml_init(lora_init_params);
-        if (lora_ctx == nullptr) {
-            LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__);
-            ggml_backend_free(backend_cpu);
-            return 1;
+        for (auto & it : buft_layer_count) {
+            int n_layers = it.second;
+            printf("buf %p layers %d\n", it.first, it.second);
+            ctx_map[it.first] = new_ggml_ctx(2*n_layers*n_tensors_per_layer);
        }
+        //ctx_map[model.buft_input.buft]  = new_ggml_ctx(2*n_inp_tensors);
+        //ctx_map[model.buft_output.buft] = new_ggml_ctx(2*n_out_tensors);
+    }
 
-        // create tensors
-        ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]);
-        ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]);
-        ggml_set_name(loraA, metaA.name.c_str());
-        ggml_set_name(loraB, metaB.name.c_str());
-
-        ggml_tensor * base_t;
-        if (ml) {
-            if (!ml->get_tensor_meta(base_name.c_str())) {
-                LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
-                return 1;
+    // bundle lora_a and lora_b into pairs
+    std::map<std::string, lora_weight> ab_map;
+    auto str_endswith = [](const std::string & str, const std::string & suffix) {
+        return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+    };
+    for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
+        std::string name(cur->name);
+        if (str_endswith(name, ".lora_a")) {
+            replace_all(name, ".lora_a", "");
+            if (ab_map.find(name) == ab_map.end()) {
+                ab_map[name] = lora_weight(cur, nullptr);
+            } else {
+                ab_map[name].a = cur;
            }
-            base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str()));
+        } else if (str_endswith(name, ".lora_b")) {
+            replace_all(name, ".lora_b", "");
+            if (ab_map.find(name) == ab_map.end()) {
+                ab_map[name] = lora_weight(nullptr, cur);
+            } else {
+                ab_map[name].b = cur;
+            }
+        }
+    }
+
+    // add tensors
+    for (auto & it : ab_map) {
+        std::string name = it.first;
+        lora_weight & w = it.second;
+        GGML_ASSERT(w.a != nullptr);
+        GGML_ASSERT(w.b != nullptr);
+        int il = -1;
+        sscanf(name.c_str(), "blk.%d.", &il);
+        if (il >= 0) {
+            printf("%s %p %p\n", name.c_str(), w.a, w.b);
+            struct ggml_context * dev_ctx = ctx_map.at(model.buft_layer[il].buft);
+            struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
+            struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
+            ggml_set_name(tensor_a, w.a->name);
+            ggml_set_name(tensor_b, w.b->name);
+            adapter.ab_map[name] = lora_weight(tensor_a, tensor_b);
         } else {
-            base_t = ggml_dup_tensor(lora_ctx, model_t);
-        }
-        ggml_set_name(base_t, base_name.c_str());
-
-        // allocate in backend buffer
-        ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
-        if (lora_buf == nullptr) {
-            LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__);
-            return 1;
-        }
-
-        // load tensor data
-        auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
-            read_buf.resize(ggml_nbytes(tensor));
-            fin.seek(tensor_meta.offset, SEEK_SET);
-            fin.read_raw(read_buf.data(), ggml_nbytes(tensor));
-            ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size());
-        };
-        load_tensor(metaA, loraA);
-        load_tensor(metaB, loraB);
-
-        // load base model tensor data
-        if (ml) {
-            ml->load_data_for(base_t);
-        } else {
-            ggml_backend_tensor_copy(model_t, base_t);
-        }
-
-        if (ggml_is_quantized(base_t->type) && !warned) {
-            LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
-                           "use a f16 or f32 base model with --lora-base\n", __func__);
-            warned = true;
-        }
-
-        if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
-            LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
-                            " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
-            ggml_free(lora_ctx);
-            ggml_backend_buffer_free(lora_buf);
-            ggml_backend_free(backend_cpu);
-            return 1;
-        }
-
-        auto build_lora_graph = [&]() {
-            // w = w + BA*s
-            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
-            ggml_set_name(BA, "BA");
-
-            if (scaling != 1.0f) {
-                BA = ggml_scale(lora_ctx, BA, scaling);
-                ggml_set_name(BA, "BA_scaled");
-            }
-
-            ggml_tensor * r;
-            r = ggml_add_inplace(lora_ctx, base_t, BA);
-            ggml_set_name(r, "r_add");
-
-            if (base_t->type != model_t->type) {
-                // convert the result to the model type
-                r = ggml_cast(lora_ctx, r, model_t->type);
-                ggml_set_name(r, "r_cast");
-            }
-
-            return r;
-        };
-
-        ggml_cgraph * gf = ggml_new_graph(lora_ctx);
-        ggml_tensor * r = build_lora_graph();
-        ggml_build_forward_expand(gf, r);
-
-        ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
-        if (graph_buf == nullptr) {
-            LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__);
-            ggml_free(lora_ctx);
-            ggml_backend_buffer_free(lora_buf);
-            ggml_backend_free(backend_cpu);
-            return 1;
-        }
-
-        ggml_backend_graph_compute(backend_cpu, gf);
-
-        ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
-
-#if 0
-        // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU
-        //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE);
-
-        // sched compute
-        ggml_build_forward_expand(gf, build_graph());
-        ggml_backend_sched_init_measure(sched, gf);
-
-        // create the graph again, since the previous one was destroyed by the measure
-        ggml_graph_clear(gf);
-        ggml_build_forward_expand(gf, build_graph());
-        ggml_backend_sched_graph_compute(sched, gf);
-        ggml_backend_sched_free(sched);
-#endif
-
-        ggml_backend_buffer_free(lora_buf);
-        ggml_backend_buffer_free(graph_buf);
-        ggml_free(lora_ctx);
-
-        n_tensors++;
-        if (n_tensors % 4 == 0) {
-            LLAMA_LOG_INFO(".");
+            // TODO: process output & token_embd tensors
         }
     }
 
-    ggml_backend_free(backend_cpu);
+    // allocate tensors / buffers and zero
+    {
+        adapter.ctxs.reserve(ctx_map.size());
+        adapter.bufs.reserve(ctx_map.size());
+        for (auto it : ctx_map) {
+            ggml_backend_buffer_type_t buft = it.first;
+            ggml_context * ctx = it.second;
+            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+            if (!buf) {
+                LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora adapter\n", __func__);
+                return -1;
+            }
+            ggml_backend_buffer_clear(buf, 0);
+            adapter.ctxs.push_back(ctx);
+            adapter.bufs.push_back(buf);
+        }
+    }
 
-    const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
-    LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
+    // set tensor data
+    {
+        llama_file gguf_file(path_lora, "rb");
+        std::vector<uint8_t> read_buf;
+        auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
+            size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name));
+            size_t size = ggml_nbytes(orig);
+            if (read_buf.size() < size) {
+                read_buf.resize(size);
+            }
+            gguf_file.seek(offs, SEEK_SET);
+            gguf_file.read_raw(read_buf.data(), size);
+            printf("%s: %s size=%ld\n", __func__, orig->name, size);
+            return ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
+        };
+        for (auto & it : adapter.ab_map) {
+            auto orig = ab_map[it.first];
+            auto dev  = it.second;
+            set_tensor(orig.a, dev.a);
+            set_tensor(orig.b, dev.b);
+        }
+    }
+
+    // free ctx for reading gguf
+    ggml_free(ctx);
 
     return 0;
 }
@@ -19298,12 +19196,19 @@ uint32_t llama_model_quantize(
     }
 }
 
-int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
+struct llama_lora_adapter * llama_lora_adapter_init(struct llama_context * ctx, const char * path_lora) {
     try {
-        return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
+        struct llama_lora_adapter * adapter = new llama_lora_adapter;
+        int res = llama_lora_adapter_init_internal(ctx->model, path_lora, *adapter);
+        if (res == 0) {
+            ctx->lora_adapters.push_back(adapter);
+            return adapter;
+        } else {
+            return nullptr;
+        }
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
-        return 1;
+        return nullptr;
     }
 }
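
Reviewer note: the allocation strategy mirrors how llama.cpp places model weights. Tensors are grouped by the backend buffer type (buft) of the layer they patch, one metadata-only ggml_context is created per buft, and ggml_backend_alloc_ctx_tensors_from_buft then materializes each group as a single buffer on the matching device. A standalone sketch of that grouping step (illustrative names, assumes the ggml headers; not code from this patch):

    #include "ggml.h"
    #include "ggml-backend.h"
    #include <map>

    // one metadata-only context per buffer type; tensor data is allocated later,
    // per group, by ggml_backend_alloc_ctx_tensors_from_buft()
    static std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_per_buft(
            const std::map<ggml_backend_buffer_type_t, int> & n_tensors_per_buft) {
        std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
        for (const auto & it : n_tensors_per_buft) {
            struct ggml_init_params params = {
                /*.mem_size   =*/ it.second * ggml_tensor_overhead(), // metadata only
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true, // no tensor data inside this context
            };
            ctx_map[it.first] = ggml_init(params);
        }
        return ctx_map;
    }

The payoff over the removed implementation: the lora_a/lora_b pairs stay as separate tensors on the same device as the layer they modify, nothing is merged into the base weights, and adapters can work with quantized models without a --lora-base fallback.
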