From e08f8a5d8c327efec5369aade6a7d92c4ccb2f1b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 10 Dec 2024 18:23:10 +0200
Subject: [PATCH] extract features

---
 examples/tts/convert_pt_to_hf.py |   1 +
 examples/tts/tts.cpp             |  15 ++++-
 src/llama.cpp                    | 108 +++++++++++++++++++++++--------
 3 files changed, 93 insertions(+), 31 deletions(-)

diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py
index a652bae43..389d2de50 100644
--- a/examples/tts/convert_pt_to_hf.py
+++ b/examples/tts/convert_pt_to_hf.py
@@ -138,6 +138,7 @@ config = {
     ],
     "hidden_size": 512,
     "vocab_size": 4096,
+    "n_head": 1,
     "max_position_embeddings": 8192, # ?
     "num_hidden_layers": 12
 }
diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
index 768015a52..d3fee7373 100644
--- a/examples/tts/tts.cpp
+++ b/examples/tts/tts.cpp
@@ -88,6 +88,7 @@ int main(int argc, char ** argv) {
     ctx_ttc = llama_init_ttc.context;
 
     params.model = params.vocoder.model;
+    params.embedding = true;
 
     common_init_result llama_init_cts = common_init_from_params(params);
     model_cts = llama_init_cts.model;
@@ -146,6 +147,9 @@ int main(int argc, char ** argv) {
 
         LOG_INF("%s: prompt audio size: %d\n", __func__, (int) prompt_inp.size());
     }
 
+    for (auto & token : prompt_inp) {
+        token -= 151672;
+    }
 
     llama_batch batch = llama_batch_init(prompt_inp.size(), 0, 1);
@@ -155,22 +159,27 @@ int main(int argc, char ** argv) {
     }
     GGML_ASSERT(batch.n_tokens == (int) prompt_inp.size());
 
-    if (llama_decode(ctx_ttc, batch) != 0) {
+    if (llama_decode(ctx_cts, batch) != 0) {
         LOG_ERR("%s: llama_decode() failed\n", __func__);
         return 1;
     }
 
-    llama_synchronize(ctx_ttc);
+    llama_synchronize(ctx_cts);
 
     LOG_INF("%s: time for prompt: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f);
 
-    const float * embd = llama_get_embeddings(ctx_ttc);
+    const float * embd = llama_get_embeddings(ctx_cts);
 
     LOG("result:\n");
     for (int i = 0; i < 10; ++i) {
         LOG("%8.3f ", embd[i]);
     }
     LOG("\n");
+    double sum = 0.0;
+    for (int i = 0; i < 261*512; ++i) {
+        sum += embd[i];
+    }
+    LOG("sum: %f\n", sum);
 
     fprintf(stderr, "\n");
diff --git a/src/llama.cpp b/src/llama.cpp
index 2ab5322c3..4fc676a1e 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3614,7 +3614,9 @@ static bool llama_kv_cache_init(
 
     const struct llama_hparams & hparams = model.hparams;
 
-    const int64_t n_layer = hparams.n_layer;
+    const int32_t n_layer = hparams.n_layer;
+
+    LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d\n", __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);
 
     cache.has_shift = false;
 
@@ -3655,10 +3657,12 @@ static bool llama_kv_cache_init(
     cache.k_l.reserve(n_layer);
     cache.v_l.reserve(n_layer);
 
-    for (int i = 0; i < (int) n_layer; i++) {
+    for (int i = 0; i < n_layer; i++) {
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
 
+        LLAMA_LOG_DEBUG("%s: layer %d: n_embd_k_gqa = %d, n_embd_v_gqa = %d\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa);
+
         ggml_backend_buffer_type_t buft;
         if (offload) {
             auto * dev = model.dev_layer.at(i).dev;
@@ -5032,7 +5036,8 @@ struct llama_model_loader {
 
     void done_getting_tensors() const {
         if (n_created != n_tensors) {
-            throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
+            // TODO: TEMPORARY DISABLED
+            //throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
         }
     }
@@ -9422,6 +9427,10 @@ static bool llm_load_tensors(
             case LLM_ARCH_OUTETTS_VOC:
                 {
                     model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0);
+                    model.output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {768, 1282}, llama_model_loader::TENSOR_NOT_REQUIRED);
                 } break;
             default:
                 throw std::runtime_error("unknown architecture");
@@ -16991,6 +17000,30 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_outetts_voc() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        cur = inpL;
+
+        //cur = llm_build_norm(ctx0, cur, hparams,
+        //        model.output_norm, NULL,
+        //        LLM_NORM_RMS, cb, -1);
+        //cb(cur, "result_norm", -1);
+
+        //// lm_head
+        //cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        //cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -17266,13 +17299,18 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_chameleon();
             } break;
+        case LLM_ARCH_OUTETTS_VOC:
+            {
+                result = llm.build_outetts_voc();
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
 
     // add on pooling layer
     if (lctx.cparams.embeddings) {
-        result = llm.append_pooling(result);
+        // TODO: TEMPORARY DISABLED
+        //result = llm.append_pooling(result);
     }
 
     llm.free();
@@ -17357,30 +17395,35 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
     }
 
     if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
-        GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
-        const int64_t n_tokens = ubatch.n_tokens;
+        //GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
 
-        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
-        int32_t * data = (int32_t *) lctx.inp_out_ids->data;
-
-        if (lctx.n_outputs == n_tokens) {
-            for (int i = 0; i < n_tokens; ++i) {
-                data[i] = i;
-            }
-        } else if (ubatch.output) {
-            int32_t n_outputs = 0;
-            for (int i = 0; i < n_tokens; ++i) {
-                if (ubatch.output[i]) {
-                    data[n_outputs++] = i;
-                }
-            }
-            // the graph needs to have been passed the correct number of outputs
-            GGML_ASSERT(lctx.n_outputs == n_outputs);
-        } else if (lctx.n_outputs == 1) {
-            // only keep last output
-            data[0] = n_tokens - 1;
+        if (!lctx.inp_out_ids) {
+            LLAMA_LOG_WARN("%s: 'lctx.inp_out_ids' is not created\n", __func__);
         } else {
-            GGML_ASSERT(lctx.n_outputs == 0);
+            const int64_t n_tokens = ubatch.n_tokens;
+
+            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
+            int32_t * data = (int32_t *) lctx.inp_out_ids->data;
+
+            if (lctx.n_outputs == n_tokens) {
+                for (int i = 0; i < n_tokens; ++i) {
+                    data[i] = i;
+                }
+            } else if (ubatch.output) {
+                int32_t n_outputs = 0;
+                for (int i = 0; i < n_tokens; ++i) {
+                    if (ubatch.output[i]) {
+                        data[n_outputs++] = i;
+                    }
+                }
+                // the graph needs to have been passed the correct number of outputs
+                GGML_ASSERT(lctx.n_outputs == n_outputs);
+            } else if (lctx.n_outputs == 1) {
+                // only keep last output
+                data[0] = n_tokens - 1;
+            } else {
+                GGML_ASSERT(lctx.n_outputs == 0);
+            }
         }
     }
 
@@ -18029,9 +18072,14 @@ static int llama_decode_internal(
 
         ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
 
+        struct ggml_tensor * res  = nullptr;
+        struct ggml_tensor * embd = nullptr;
+
+// TODO: TEMPORARY DISABLED
+if (model.arch != LLM_ARCH_OUTETTS_VOC) {
         // the output is always the last tensor in the graph
-        struct ggml_tensor * res  = ggml_graph_node(gf, -1);
-        struct ggml_tensor * embd = ggml_graph_node(gf, -2);
+        res  = ggml_graph_node(gf, -1);
+        embd = ggml_graph_node(gf, -2);
 
         if (lctx.n_outputs == 0) {
             // no output
@@ -18051,6 +18099,10 @@ static int llama_decode_internal(
                 embd = nullptr; // do not extract embeddings when not needed
                 GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
             }
+} else {
+    res  = nullptr;
+    embd = ggml_graph_node(gf, -1);
+}
 
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
 
         ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);