fix n_embd + remove llama.cpp hacks

parent dcf2230afb
commit 86d0ad5ef4
@@ -152,7 +152,7 @@ config = {
     "architectures": [
         "OuteTTSVocoder"
     ],
-    "hidden_size": 512,
+    "hidden_size": 1282,
     "vocab_size": 4096,
     "n_head": 1,
     "layer_norm_epsilon": 1e-6,
@@ -168,9 +168,10 @@ int main(int argc, char ** argv) {

     LOG_INF("%s: time for prompt: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f);

+    const int n_embd = llama_n_embd(model_cts);
     const float * embd = llama_get_embeddings(ctx_cts);

-    int n = 1282*261;
+    int n = n_embd*261;

     LOG("result:\n");
     for (int i = 0; i < 10; ++i) {
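Note: to make the hunk above concrete, here is a minimal caller-side sketch of reading the vocoder output without hard-coding the 1282-wide embedding. It assumes model_cts/ctx_cts are the loaded vocoder model and context from the example, and the 261 frame count mirrors the hard-coded value in the hunk; the function name and the printing are illustrative, not part of the commit.

    #include <cstdio>
    #include "llama.h"

    // Sketch only: size the result from llama_n_embd() instead of the literal 1282.
    void dump_vocoder_output(const llama_model * model_cts, llama_context * ctx_cts) {
        const int     n_embd = llama_n_embd(model_cts);        // 1282 for this vocoder after the commit
        const float * embd   = llama_get_embeddings(ctx_cts);  // n_embd floats per output frame

        const int n_frames = 261;                // mirrors the hard-coded frame count in the hunk
        const int n        = n_embd * n_frames;  // total number of spectrogram values

        printf("result: %d values\n", n);
        for (int i = 0; i < 10; ++i) {
            printf("embd[%d] = %f\n", i, embd[i]);
        }
    }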
@@ -9539,12 +9539,12 @@ static bool llm_load_tensors(
                 } break;
             case LLM_ARCH_OUTETTS_VOC:
                 {
-                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {512, n_vocab}, 0);

                     model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {768}, 0);
                     model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {768}, 0);

-                    model.conv_1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, n_embd, 768}, 0);
+                    model.conv_1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, 512, 768}, 0);
                     model.conv_1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {768}, 0);

                     model.posnet_0_norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", 0), {768}, 0);
@@ -9636,8 +9636,8 @@ static bool llm_load_tensors(
                     model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {768}, 0);
                     model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {768}, 0);

-                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, 1282}, 0);
-                    model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {1282}, 0);
+                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, n_embd}, 0);
+                    model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);

                     model.hann_window = create_tensor(tn(LLM_TENSOR_HANN_WINDOW, "weight"), {1280}, 0);
                 } break;
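Note: taken together, these two tensor hunks fix what n_embd means for the vocoder: it is now the 1282-wide output embedding ("hidden_size" in the config hunk above), while the token-embedding and convolution widths stay fixed and are therefore written as literals. The sketch below only restates that dimension flow; the variable names are illustrative, not llama.cpp identifiers.

    #include <cstdio>

    // Dimension flow implied by the tensor shapes above (values taken from the diff).
    int main() {
        const int n_tok_embd = 512;   // token embedding width (a literal now, no longer n_embd)
        const int n_inner    = 768;   // conv_1d / posnet / output_norm width
        const int n_embd     = 1282;  // final per-token output width ("hidden_size")

        printf("tok_embd : [%d, n_vocab]\n", n_tok_embd);                     // token id -> 512-wide vector
        printf("conv_1d  : [7, %d, %d]\n", n_tok_embd, n_inner);              // 512 -> 768
        printf("output   : [%d, %d], bias [%d]\n", n_inner, n_embd, n_embd);  // 768 -> 1282 per token
        return 0;
    }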
@@ -17432,14 +17432,12 @@ struct llm_build_context {
                 model.output_norm,
                 model.output_norm_b,
                 LLM_NORM, cb, -1);
-        cb(cur, "result_norm", -1);

         // lm_head
         cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
-        cb(cur, "result_output_no_bias", -1);

         cur = ggml_add(ctx0, cur, model.output_b);
-        cb(cur, "result_output", -1);
+        cb(cur, "result_embd", -1);

         printf("cur: %d %d %d\n", cur->ne[0], cur->ne[1], cur->ne[2]);

@@ -17732,8 +17730,7 @@ static struct ggml_cgraph * llama_build_graph(

     // add on pooling layer
     if (lctx.cparams.embeddings) {
-        // TODO: TEMPORARY DISABLED [OUTETTS]
-        //result = llm.append_pooling(result);
+        result = llm.append_pooling(result);
     }

     llm.free();
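Note: with append_pooling re-enabled, the vocoder result goes through the regular embeddings path instead of a special case. A minimal sketch of the context setup that drives this branch, assuming the caller wants one embedding vector per token; the helper name is illustrative.

    #include "llama.h"

    // Sketch only: the cparams.embeddings branch above runs when the context is
    // created with embeddings enabled; LLAMA_POOLING_TYPE_NONE keeps per-token vectors.
    llama_context * make_vocoder_context(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        cparams.embeddings   = true;                     // ask for embeddings instead of logits
        cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;  // one n_embd vector per output token

        return llama_new_context_with_model(model, cparams);
    }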
@@ -18221,13 +18218,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
     }

     const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0;

-    // TODO: TEMPORARY !!! [OUTETTS]
-#if 0
     const size_t new_size = (logits_size + embd_size) * sizeof(float);
-#else
-    const size_t new_size = 1024*1024*32;
-#endif

     // alloc only when more than the current capacity is required
     // TODO: also consider shrinking the buffer
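Note: a back-of-the-envelope on why the fixed 32 MiB buffer is no longer needed: the restored sizing scales with the number of reserved outputs and with n_embd, so for the vocoder it stays small. The sketch below only paraphrases the kept line, assuming logits are disabled and embd_size is roughly n_embd times the reserved output count.

    #include <cstdio>
    #include <cstddef>

    // Rough sizing sketch for the embeddings-only case.
    int main() {
        const size_t n_embd        = 1282;  // vocoder embedding width after this commit
        const size_t n_outputs_max = 261;   // e.g. the frame count used in the tts example

        const size_t embd_size = n_embd * n_outputs_max;     // floats
        const size_t new_size  = embd_size * sizeof(float);  // bytes

        printf("output buffer: %zu floats = %zu bytes (~%.1f MiB)\n",
               embd_size, new_size, new_size / (1024.0 * 1024.0));  // ~1.3 MiB, well under the old 32 MiB
        return 0;
    }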
@@ -18501,14 +18492,9 @@ static int llama_decode_internal(

         ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);

-        struct ggml_tensor * res  = nullptr;
-        struct ggml_tensor * embd = nullptr;
-
-// TODO: TEMPORARY DISABLED [OUTETTS]
-if (model.arch != LLM_ARCH_OUTETTS_VOC) {
         // the output is always the last tensor in the graph
-        res  = ggml_graph_node(gf, -1);
-        embd = ggml_graph_node(gf, -2);
+        struct ggml_tensor * res  = ggml_graph_node(gf, -1);
+        struct ggml_tensor * embd = ggml_graph_node(gf, -2);

         if (lctx.n_outputs == 0) {
             // no output
@@ -18528,10 +18514,7 @@ if (model.arch != LLM_ARCH_OUTETTS_VOC) {
                 embd = nullptr; // do not extract embeddings when not needed
                 GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
             }
-} else {
-        res = nullptr;
-        embd = ggml_graph_node(gf, -1);
-}
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

         ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
@@ -18599,9 +18582,7 @@ if (model.arch != LLM_ARCH_OUTETTS_VOC) {
                         if (n_outputs_new) {
                             GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
                             GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size);
-                            // TODO: TEMPORARY [OUTETTS]
-                            //ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
-                            ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*1282*sizeof(float));
+                            ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
                         }
                     } break;
                 case LLAMA_POOLING_TYPE_MEAN:
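Note: a small arithmetic check of why the removed hard-coded copy happened to work and why the restored generic form is the right one: the two byte counts only coincide when n_embd is 1282 and every token in the batch is an output. All values below are illustrative.

    #include <cassert>
    #include <cstddef>

    int main() {
        const size_t n_embd        = 1282;  // vocoder embedding width
        const size_t n_tokens      = 261;   // illustrative batch size
        const size_t n_outputs_new = 261;   // all tokens produce outputs in the vocoder case

        const size_t bytes_generic = n_outputs_new * n_embd * sizeof(float);  // restored code path
        const size_t bytes_hack    = n_tokens      * 1282   * sizeof(float);  // removed hack

        assert(bytes_generic == bytes_hack);  // equal only under these assumptions
        return 0;
    }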