From f547c4f54a1f70cc660d194bab4d2b5414be8a7f Mon Sep 17 00:00:00 2001
From: Maximilian Winter
Date: Sun, 12 May 2024 12:06:36 +0200
Subject: [PATCH] Update server.cpp

---
 examples/server/server.cpp | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index ceaeb1f76..32ceab1fb 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2029,7 +2029,24 @@ struct server_context {
                 // reuse any previously computed tokens that are common with the new prompt
                 slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
-
+                if (slot.ga_n != 1)
+                {
+                    int ga_i = 0;
+                    int32_t ga_n = slot.ga_n;
+                    int32_t ga_w = slot.ga_w;
+                    int32_t slot_npast = 0;
+                    for (int k = 0; k < slot.n_past; ++k)
+                    {
+                        while (slot_npast >= ga_i + ga_w) {
+                            const int bd = (ga_w/ga_n)*(ga_n - 1);
+                            slot_npast -= bd;
+                            ga_i += ga_w/ga_n;
+                        }
+                        slot_npast++;
+                    }
+                    slot.n_past_se = slot_npast;
+                    slot.ga_i = ga_i;
+                }
                 // push the prompt into the sampling context (do not apply grammar)
                 for (int i = 0; i < slot.n_past; ++i) {
                     llama_sampling_accept(slot.ctx_sampling, ctx, slot.cache_tokens[i], false);
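
Note (not part of the patch): as I read the hunk, when group attention / self-extend is enabled (slot.ga_n != 1) the new loop replays, for every token reused from the prompt cache, the same window shifts that would have been applied while decoding those tokens, so that slot.n_past_se and slot.ga_i end up consistent with the reused cache instead of starting from zero. The standalone C++ sketch below isolates that bookkeeping so its effect can be checked; the helper name replay_group_attention and the sample values (4096 reused tokens, ga_n = 4, ga_w = 1024) are mine, only the loop body mirrors the patch.

    // Standalone sketch: replays the group-attention shift bookkeeping
    // added by the hunk above. Names mirror the patch; sample inputs are
    // made up for illustration.
    #include <cstdint>
    #include <cstdio>

    struct ga_state {
        int32_t n_past_se; // position in the shifted (self-extend) coordinate system
        int32_t ga_i;      // start of the current group-attention window
    };

    // Replay the shifts that decoding n_past tokens would have produced
    // with group size ga_n and window ga_w (ga_n != 1 enables self-extend).
    static ga_state replay_group_attention(int32_t n_past, int32_t ga_n, int32_t ga_w) {
        int32_t ga_i       = 0;
        int32_t slot_npast = 0;
        for (int32_t k = 0; k < n_past; ++k) {
            while (slot_npast >= ga_i + ga_w) {
                // how far each completed window is pulled back
                const int32_t bd = (ga_w / ga_n) * (ga_n - 1);
                slot_npast -= bd;
                ga_i       += ga_w / ga_n;
            }
            slot_npast++;
        }
        return { slot_npast, ga_i };
    }

    int main() {
        // hypothetical values: 4096 cached tokens, ga_n = 4, ga_w = 1024
        const ga_state st = replay_group_attention(4096, 4, 1024);
        printf("n_past_se = %d, ga_i = %d\n", (int) st.n_past_se, (int) st.ga_i);
        return 0;
    }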