server : fix parallel speculative decoding (#10513)

ggml-ci
This commit is contained in:
Georgi Gerganov 2024-11-26 13:36:40 +02:00 committed by GitHub
parent 811872a59d
commit 84e1c33cde
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -2267,12 +2267,7 @@ struct server_context {
continue; // continue loop of slots
}
llama_token id;
{
completion_token_output result;
id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
slot.i_batch = -1;
@@ -2285,6 +2280,7 @@ struct server_context {
metrics.on_prompt_eval(slot);
}
completion_token_output result;
result.tok = id;
const auto * cur_p = common_sampler_get_candidates(slot.smpl);
@@ -2306,11 +2302,14 @@ struct server_context {
}
}
// check if the slot supports speculative decoding
if (!slot.can_speculate()) {
// do speculative decoding
for (auto & slot : slots) {
if (!slot.is_processing() || !slot.can_speculate()) {
continue;
}
llama_token id = slot.sampled;
struct common_speculative_params params_spec;
params_spec.n_draft = slot.params.speculative.n_max;
params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;