From f07cd35da4d5ee69e1a330161e1a5cf9f66d7dbe Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 17 Oct 2023 11:40:09 +0300
Subject: [PATCH] speculative : fix off-by-one for n_drafted

---
 examples/speculative/speculative.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 117f1b41f..48cdd4d31 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -336,7 +336,7 @@ int main(int argc, char ** argv) {
                     llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);
 
                     // no need to evaluate the last drafted token, since we won't use the result
-                    if (batch_tgt.n_tokens == n_draft) {
+                    if (batch_tgt.n_tokens > n_draft) {
                         drafts[s].drafting = false;
                         continue;
                     }
@@ -358,11 +358,14 @@ int main(int argc, char ** argv) {
             ++n_past_cur;
             ++n_drafted;
 
-            if (batch_tgt.n_tokens >= n_draft) {
+            if (batch_tgt.n_tokens > n_draft) {
                 break;
             }
         }
 
+        // account for the last drafted token that we didn't evaluate
+        ++n_drafted;
+
         // evaluate the target model on the drafted tokens
         {
             llama_kv_cache_seq_keep(ctx_tgt, 0);