From 0ba40c36150e29d1b7893a35e41f806e43b596e3 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 25 Nov 2024 10:16:27 +0200
Subject: [PATCH] server : add helper function slot.can_speculate()

ggml-ci
---
 examples/server/server.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index eb89dd90f..f9d20fee5 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -241,6 +241,10 @@ struct server_slot {
         return state != SLOT_STATE_IDLE;
     }
 
+    bool can_speculate() const {
+        return ctx_dft && params.speculative.n_max > 0 && params.cache_prompt;
+    }
+
     void add_token(const completion_token_output & token) {
         if (!is_processing()) {
             SLT_WRN(*this, "%s", "slot is not processing\n");
@@ -1270,7 +1274,7 @@ struct server_context {
             {"min_keep",            slot.params.sampling.min_keep},
             {"grammar",             slot.params.sampling.grammar},
             {"samplers",            samplers},
-            {"speculative",         slot.params.speculative.model.empty() ? false : true},
+            {"speculative",         slot.can_speculate()},
             {"speculative.n_max",   slot.params.speculative.n_max},
             {"speculative.n_min",   slot.params.speculative.n_min},
             {"speculative.p_min",   slot.params.speculative.p_min},
@@ -2302,11 +2306,10 @@ struct server_context {
             }
 
             // check if the slot supports speculative decoding
-            if (!slot.ctx_dft || slot.params.speculative.n_max <= 0 || !slot.params.cache_prompt) {
+            if (!slot.can_speculate()) {
                 continue;
             }
 
-            // TODO: configurable through requests
             struct common_speculative_params params_spec;
             params_spec.n_draft = slot.params.speculative.n_max;
             params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;