From 5aaf4a8aa6801f1a85fa8f9e05fc196067d86806 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 11 Dec 2024 12:35:47 +0200
Subject: [PATCH] compute hann window

---
 examples/tts/convert_pt_to_hf.py |  5 +----
 examples/tts/tts.cpp             | 15 +++++++++++++++
 gguf-py/gguf/constants.py        |  3 ---
 gguf-py/gguf/tensor_mapping.py   |  4 ----
 include/llama.h                  |  3 ---
 src/llama.cpp                    | 27 +++++++++++----------------
 6 files changed, 27 insertions(+), 30 deletions(-)

diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py
index d06624879..c4a1185a8 100644
--- a/examples/tts/convert_pt_to_hf.py
+++ b/examples/tts/convert_pt_to_hf.py
@@ -70,7 +70,7 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'):
         # keep only what we need for inference
         if not key.startswith('state_dict.feature_extractor.encodec.quantizer.') and \
            not key.startswith('state_dict.backbone.') and \
-           not key.startswith('state_dict.head.'):
+           not key.startswith('state_dict.head.out'):
             print('Skipping key: ', key)
             continue
 
@@ -101,9 +101,6 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'):
         if new_key.endswith("gamma"):
             new_key = new_key.replace("gamma", "gamma.weight")
 
-        if new_key == "head.istft.window":
-            new_key = "head.istft.window.weight"
-
         size_mb = value.element_size() * value.nelement() / (1024 * 1024)
         print(f"{size_mb:8.2f} MB - {new_key}: {value.shape}")
 
diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp
index 684f6b2fb..f402ba8a2 100644
--- a/examples/tts/tts.cpp
+++ b/examples/tts/tts.cpp
@@ -57,6 +57,16 @@ static void print_usage(int, char ** argv) {
     LOG("\n");
 }
 
+void fill_hann_window(int length, bool periodic, float * output) {
+    int offset = -1;
+    if (periodic) {
+        offset = 0;
+    }
+    for (int i = 0; i < length; i++) {
+        output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
+    }
+}
+
 int main(int argc, char ** argv) {
     common_params params;
 
@@ -171,6 +181,11 @@ int main(int argc, char ** argv) {
     const int n_embd = llama_n_embd(model_cts);
     const float * embd = llama_get_embeddings(ctx_cts);
 
+    const int w = 1280;
+    std::vector<float> hann(w);
+    fill_hann_window(hann.size(), true, hann.data());
+
+    int n = n_embd*261;
 
     LOG("result:\n");
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index ea74354a4..f1f44c7d2 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -387,7 +387,6 @@ class MODEL_TENSOR(IntEnum):
     POS_NET_ATTN_K   = auto()
     POS_NET_ATTN_V   = auto()
     POS_NET_ATTN_OUT = auto()
-    HANN_WINDOW      = auto()
 
 
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -569,7 +568,6 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.POS_NET_ATTN_K:   "pos_net.{bid}.attn_k",
    MODEL_TENSOR.POS_NET_ATTN_V:   "pos_net.{bid}.attn_v",
    MODEL_TENSOR.POS_NET_ATTN_OUT: "pos_net.{bid}.attn_output",
-   MODEL_TENSOR.HANN_WINDOW:      "hann_window",
 }
 
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -1429,7 +1427,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.POS_NET_ATTN_K,
         MODEL_TENSOR.POS_NET_ATTN_V,
         MODEL_TENSOR.POS_NET_ATTN_OUT,
-        MODEL_TENSOR.HANN_WINDOW,
     ],
     # TODO
 }
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 93b70a147..5bf1f514a 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -94,10 +94,6 @@ class TensorNameMap:
         MODEL_TENSOR.ROPE_FACTORS_LONG:  (),
         MODEL_TENSOR.ROPE_FACTORS_SHORT: (),
 
-        MODEL_TENSOR.HANN_WINDOW: (
-            "head.istft.window", # outetts
-        ),
-
         MODEL_TENSOR.CONV1D: (
             "backbone.embed", # roberta
         ),
diff --git a/include/llama.h b/include/llama.h
index efbb27d21..a4abf395b 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -482,9 +482,6 @@ extern "C" {
     // Returns the total number of parameters in the model
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
-    // Get a llama model tensor
-    LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
-
     // Returns true if the model contains an encoder that requires llama_encode() call
     LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);
 
diff --git a/src/llama.cpp b/src/llama.cpp
index 536ac1df6..2638c89f2 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -627,7 +627,6 @@ enum llm_tensor {
     LLM_TENSOR_POS_NET_ATTN_K,
     LLM_TENSOR_POS_NET_ATTN_V,
     LLM_TENSOR_POS_NET_ATTN_OUT,
-    LLM_TENSOR_HANN_WINDOW,
 };
 
 static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
@@ -1635,7 +1634,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_POS_NET_ATTN_K,   "pos_net.%d.attn_k" },
             { LLM_TENSOR_POS_NET_ATTN_V,   "pos_net.%d.attn_v" },
             { LLM_TENSOR_POS_NET_ATTN_OUT, "pos_net.%d.attn_output" },
-            { LLM_TENSOR_HANN_WINDOW,      "hann_window" },
         },
     },
     {
@@ -3648,6 +3646,17 @@ static int llama_get_device_count(const llama_model & model) {
     return (int) model.devices.size();
 }
 
+static struct ggml_tensor * llama_get_model_tensor(const struct llama_model * model, const char * name) {
+    auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
+            [name](const std::pair<std::string, struct ggml_tensor *> & it) {
+                return it.first == name;
+            });
+    if (it == model->tensors_by_name.end()) {
+        return nullptr;
+    }
+    return it->second;
+}
+
 template<typename F>
 static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
     ggml_init_params params = {
@@ -7462,7 +7471,6 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
     {LLM_TENSOR_CONV_NEXT_PW1,   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CONV_NEXT_PW2,   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CONV_NEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_HANN_WINDOW,     {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
 };
 
 // checks if the weight tensor can be used with the specified buffer type and device
@@ -9638,8 +9646,6 @@ static bool llm_load_tensors(
 
                     model.output   = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {768, n_embd}, 0);
                     model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"),   {n_embd}, 0);
-
-                    model.hann_window = create_tensor(tn(LLM_TENSOR_HANN_WINDOW, "weight"), {1280}, 0);
                 } break;
             default:
                 throw std::runtime_error("unknown architecture");
@@ -21021,17 +21027,6 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
     return model->n_elements;
 }
 
-struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
-    auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
-            [name](const std::pair<std::string, struct ggml_tensor *> & it) {
-                return it.first == name;
-            });
-    if (it == model->tensors_by_name.end()) {
-        return nullptr;
-    }
-    return it->second;
-}
-
 bool llama_model_has_encoder(const struct llama_model * model) {
     switch (model->arch) {
         case LLM_ARCH_T5: return true;
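
Note on the window computation: the removed head.istft.window tensor held a 1280-point
periodic Hann window, w[n] = 0.5 * (1 - cos(2*pi*n / N)) with N = length for the periodic
variant (N = length - 1 for the symmetric one), which is why the patch can recompute it at
runtime instead of shipping it in the GGUF. The standalone sketch below is illustrative
only and not part of the patch: it duplicates the fill_hann_window() added to
examples/tts/tts.cpp and prints two easy-to-check samples. The main() harness and the
expected values are assumptions; periodic == true is assumed to match the
torch.hann_window(1280) default that presumably produced the original checkpoint tensor.

    // sanity-check sketch for fill_hann_window() -- not part of the patch
    #define _USE_MATH_DEFINES
    #include <cmath>
    #include <cstdio>
    #include <vector>

    // copied from the tts.cpp hunk above
    void fill_hann_window(int length, bool periodic, float * output) {
        int offset = -1;
        if (periodic) {
            offset = 0;
        }
        for (int i = 0; i < length; i++) {
            output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
        }
    }

    int main() {
        const int w = 1280; // same size as the old head.istft.window tensor
        std::vector<float> hann(w);
        fill_hann_window(hann.size(), true, hann.data());

        // periodic Hann window: starts at 0, peaks at 1 in the middle
        printf("hann[0]    = %f\n", hann[0]);        // expected 0.000000
        printf("hann[%4d] = %f\n", w/2, hann[w/2]);  // expected 1.000000
        return 0;
    }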