From f07cd35da4d5ee69e1a330161e1a5cf9f66d7dbe Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 17 Oct 2023 11:40:09 +0300
Subject: [PATCH] speculative : fix off-by-one for n_drafted

---
 examples/speculative/speculative.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 117f1b41f..48cdd4d31 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -336,7 +336,7 @@ int main(int argc, char ** argv) {
                     llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);
 
                     // no need to evaluate the last drafted token, since we won't use the result
-                    if (batch_tgt.n_tokens == n_draft) {
+                    if (batch_tgt.n_tokens > n_draft) {
                         drafts[s].drafting = false;
                         continue;
                     }
@@ -358,11 +358,14 @@ int main(int argc, char ** argv) {
             ++n_past_cur;
             ++n_drafted;
 
-            if (batch_tgt.n_tokens >= n_draft) {
+            if (batch_tgt.n_tokens > n_draft) {
                 break;
             }
         }
 
+        // account for the last drafted token that we didn't evaluate
+        ++n_drafted;
+
         // evaluate the target model on the drafted tokens
         {
             llama_kv_cache_seq_keep(ctx_tgt, 0);