From e08f8a5d8c327efec5369aade6a7d92c4ccb2f1b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 10 Dec 2024 18:23:10 +0200
Subject: [PATCH] extract features

---
 examples/tts/convert_pt_to_hf.py |   1 +
 examples/tts/tts.cpp             |  15 ++++-
 src/llama.cpp                    | 108 +++++++++++++++++++++++--------
 3 files changed, 93 insertions(+), 31 deletions(-)

diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py
index a652bae43..389d2de50 100644
--- a/examples/tts/convert_pt_to_hf.py
+++ b/examples/tts/convert_pt_to_hf.py
@@ -138,6 +138,7 @@ config = {
     ],
     "hidden_size": 512,
     "vocab_size": 4096,
+    "n_head": 1,
     "max_position_embeddings": 8192, # ?
     "num_hidden_layers": 12
 }
diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
index 768015a52..d3fee7373 100644
--- a/examples/tts/tts.cpp
+++ b/examples/tts/tts.cpp
@@ -88,6 +88,7 @@ int main(int argc, char ** argv) {
     ctx_ttc = llama_init_ttc.context;
 
     params.model = params.vocoder.model;
+    params.embedding = true;
 
     common_init_result llama_init_cts = common_init_from_params(params);
     model_cts = llama_init_cts.model;
@@ -146,6 +147,9 @@ int main(int argc, char ** argv) {
 
         LOG_INF("%s: prompt audio size: %d\n", __func__, (int) prompt_inp.size());
     }
 
+    for (auto & token : prompt_inp) {
+        token -= 151672;
+    }
 
     llama_batch batch = llama_batch_init(prompt_inp.size(), 0, 1);
@@ -155,22 +159,27 @@ int main(int argc, char ** argv) {
     }
     GGML_ASSERT(batch.n_tokens == (int) prompt_inp.size());
 
-    if (llama_decode(ctx_ttc, batch) != 0) {
+    if (llama_decode(ctx_cts, batch) != 0) {
         LOG_ERR("%s: llama_decode() failed\n", __func__);
         return 1;
     }
 
-    llama_synchronize(ctx_ttc);
+    llama_synchronize(ctx_cts);
 
     LOG_INF("%s: time for prompt: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f);
 
-    const float * embd = llama_get_embeddings(ctx_ttc);
+    const float * embd = llama_get_embeddings(ctx_cts);
 
     LOG("result:\n");
     for (int i = 0; i < 10; ++i) {
         LOG("%8.3f ", embd[i]);
     }
     LOG("\n");
+    double sum = 0.0;
+    for (int i = 0; i < 261*512; ++i) {
+        sum += embd[i];
+    }
+    LOG("sum: %f\n", sum);
 
     fprintf(stderr, "\n");
diff --git a/src/llama.cpp b/src/llama.cpp
index 2ab5322c3..4fc676a1e 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3614,7 +3614,9 @@ static bool llama_kv_cache_init(
 
     const struct llama_hparams & hparams = model.hparams;
 
-    const int64_t n_layer = hparams.n_layer;
+    const int32_t n_layer = hparams.n_layer;
+
+    LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d\n", __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);
 
     cache.has_shift = false;
 
@@ -3655,10 +3657,12 @@ static bool llama_kv_cache_init(
     cache.k_l.reserve(n_layer);
     cache.v_l.reserve(n_layer);
 
-    for (int i = 0; i < (int) n_layer; i++) {
+    for (int i = 0; i < n_layer; i++) {
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
 
+        LLAMA_LOG_DEBUG("%s: layer %d: n_embd_k_gqa = %d, n_embd_v_gqa = %d\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa);
+
         ggml_backend_buffer_type_t buft;
         if (offload) {
             auto * dev = model.dev_layer.at(i).dev;
@@ -5032,7 +5036,8 @@ struct llama_model_loader {
 
     void done_getting_tensors() const {
         if (n_created != n_tensors) {
-            throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
+            // TODO: TEMPORARY DISABLED
+            //throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
         }
     }
@@ -9422,6 +9427,10 @@ static bool llm_load_tensors(
             case LLM_ARCH_OUTETTS_VOC:
                 {
                     model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0);
+                    model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED);
                 } break;
             default:
                 throw std::runtime_error("unknown architecture");
@@ -16991,6 +17000,30 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_outetts_voc() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        cur = inpL;
+
+        //cur = llm_build_norm(ctx0, cur, hparams,
+        //        model.output_norm, NULL,
+        //        LLM_NORM_RMS, cb, -1);
+        //cb(cur, "result_norm", -1);
+
+        //// lm_head
+        //cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        //cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -17266,13 +17299,18 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_chameleon();
             } break;
+        case LLM_ARCH_OUTETTS_VOC:
+            {
+                result = llm.build_outetts_voc();
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
 
     // add on pooling layer
     if (lctx.cparams.embeddings) {
-        result = llm.append_pooling(result);
+        // TODO: TEMPORARY DISABLED
+        //result = llm.append_pooling(result);
     }
 
     llm.free();
@@ -17357,30 +17395,35 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
     }
 
     if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
-        GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
-        const int64_t n_tokens = ubatch.n_tokens;
+        //GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
 
-        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
-        int32_t * data = (int32_t *) lctx.inp_out_ids->data;
-
-        if (lctx.n_outputs == n_tokens) {
-            for (int i = 0; i < n_tokens; ++i) {
-                data[i] = i;
-            }
-        } else if (ubatch.output) {
-            int32_t n_outputs = 0;
-            for (int i = 0; i < n_tokens; ++i) {
-                if (ubatch.output[i]) {
-                    data[n_outputs++] = i;
-                }
-            }
-            // the graph needs to have been passed the correct number of outputs
-            GGML_ASSERT(lctx.n_outputs == n_outputs);
-        } else if (lctx.n_outputs == 1) {
-            // only keep last output
-            data[0] = n_tokens - 1;
+        if (!lctx.inp_out_ids) {
+            LLAMA_LOG_WARN("%s: 'lctx.inp_out_ids' is not created\n", __func__);
         } else {
-            GGML_ASSERT(lctx.n_outputs == 0);
+            const int64_t n_tokens = ubatch.n_tokens;
+
+            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
+            int32_t * data = (int32_t *) lctx.inp_out_ids->data;
+
+            if (lctx.n_outputs == n_tokens) {
+                for (int i = 0; i < n_tokens; ++i) {
+                    data[i] = i;
+                }
+            } else if (ubatch.output) {
+                int32_t n_outputs = 0;
+                for (int i = 0; i < n_tokens; ++i) {
+                    if (ubatch.output[i]) {
+                        data[n_outputs++] = i;
+                    }
+                }
+                // the graph needs to have been passed the correct number of outputs
+                GGML_ASSERT(lctx.n_outputs == n_outputs);
+            } else if (lctx.n_outputs == 1) {
+                // only keep last output
+                data[0] = n_tokens - 1;
+            } else {
+                GGML_ASSERT(lctx.n_outputs == 0);
+            }
         }
     }
 
@@ -18029,9 +18072,14 @@ static int llama_decode_internal(
 
         ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
 
+        struct ggml_tensor * res  = nullptr;
+        struct ggml_tensor * embd = nullptr;
+
+// TODO: TEMPORARY DISABLED
+if (model.arch != LLM_ARCH_OUTETTS_VOC) {
         // the output is always the last tensor in the graph
-        struct ggml_tensor * res  = ggml_graph_node(gf, -1);
-        struct ggml_tensor * embd = ggml_graph_node(gf, -2);
+        res  = ggml_graph_node(gf, -1);
+        embd = ggml_graph_node(gf, -2);
 
         if (lctx.n_outputs == 0) {
             // no output
@@ -18051,6 +18099,10 @@ static int llama_decode_internal(
                 embd = nullptr; // do not extract embeddings when not needed
                 GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
             }
+} else {
+    res  = nullptr;
+    embd = ggml_graph_node(gf, -1);
+}
 
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
         ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);