server : fix parallel speculative decoding (#10513)

ggml-ci
2024-12-26 19:34:35 +00:00 · 2024-11-26 13:36:40 +02:00 · 2024-11-26 13:36:40 +02:00 · 84e1c33cde
commit 84e1c33cde
parent 811872a59d
1 changed file with 35 additions and 36 deletions
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2267,49 +2267,48 @@ struct server_context {
                    continue; // continue loop of slots
                }
-                llama_token id;
+                llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
-                {
+                slot.i_batch = -1;
                    completion_token_output result;
-                    id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
+                common_sampler_accept(slot.smpl, id, true);
-                    slot.i_batch = -1;
+                slot.n_decoded += 1;
-
+                if (slot.n_decoded == 1) {
-                    common_sampler_accept(slot.smpl, id, true);
+                    slot.t_start_generation = ggml_time_us();
-
+                    slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
-                    slot.n_decoded += 1;
+                    metrics.on_prompt_eval(slot);
                    if (slot.n_decoded == 1) {
                        slot.t_start_generation = ggml_time_us();
                        slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
                        metrics.on_prompt_eval(slot);
                    }
                    result.tok = id;
                    const auto * cur_p = common_sampler_get_candidates(slot.smpl);
                    for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) {
                        result.probs.push_back({
                            cur_p->data[i].id,
                                i >= cur_p->size ? 0.0f : cur_p->data[i].p,
                        });
                    }
                    if (!process_token(result, slot)) {
                        // release slot because of stop condition
                        slot.release();
                        slot.print_timings();
                        send_final_response(slot);
                        metrics.on_prediction(slot);
                        continue;
                    }
                }
-                // check if the slot supports speculative decoding
+                completion_token_output result;
-                if (!slot.can_speculate()) {
+                result.tok = id;
                const auto * cur_p = common_sampler_get_candidates(slot.smpl);
                for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) {
                    result.probs.push_back({
                        cur_p->data[i].id,
                            i >= cur_p->size ? 0.0f : cur_p->data[i].p,
                    });
                }
                if (!process_token(result, slot)) {
                    // release slot because of stop condition
                    slot.release();
                    slot.print_timings();
                    send_final_response(slot);
                    metrics.on_prediction(slot);
                    continue;
                }
            }
            // do speculative decoding
            for (auto & slot : slots) {
                if (!slot.is_processing() || !slot.can_speculate()) {
                    continue;
                }
                llama_token id = slot.sampled;
                struct common_speculative_params params_spec;
                params_spec.n_draft   = slot.params.speculative.n_max;