speculative : fix the draft sampling

ggml-ci
2024-12-27 20:04:35 +00:00 · 2024-11-24 12:50:17 +02:00 · 2024-11-24 12:50:17 +02:00 · d9fb3b2e01
commit d9fb3b2e01
parent be5f611000
3 changed files with 26 additions and 11 deletions
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -320,7 +320,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
    return cur_p.data[cur_p.selected].id;
 }

-std::vector<llama_token> common_sampler_sample_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
    GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");

    std::vector<llama_token> result;
@ -330,25 +330,33 @@ std::vector<llama_token> common_sampler_sample_n(struct common_sampler * gsmpl,
    for (; i < draft.size(); i++) {
        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);

+        common_sampler_accept(gsmpl, id, true);
+
+        result.push_back(id);
+
        if (draft[i] != id) {
            break;
        }
+    }
+
+    if (i == draft.size()) {
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+
+        common_sampler_accept(gsmpl, id, true);

        result.push_back(id);
    }

-    result.push_back(common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first));
-
    return result;
 }

-std::vector<llama_token> common_sampler_sample_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
    std::vector<int> idxs(draft.size() + 1);
    for (size_t i = 0; i < idxs.size(); ++i) {
        idxs[i] = i;
    }

-    return common_sampler_sample_n(gsmpl, ctx, idxs, draft, grammar_first);
+    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
 }

 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
--- a/common/sampling.h
+++ b/common/sampling.h
@ -62,19 +62,24 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co

 // generalized version of common_sampler_sample
 //
-// will cross-reference the sampled tokens with a batch of draft tokens
-// if the sampler disagrees at some point, we stop and return the sampled tokens up to now
+// will cross-reference the sampled tokens with a batch of draft tokens and accept those that match
+// if the sampler disagrees at some point, we stop and return the accepted tokens up to now
 //
-// `common_sampler_sample_n(gsmpl, ctx, { idx }, {})` is equivalent to `common_sampler_sample(gsmpl, ctx, idx)`
+//      common_sampler_sample_n(gsmpl, ctx, { idx }, {});
+//
+// is equivalent to
+//
+//      common_sampler_sample(gsmpl, ctx, idx);
+//      common_sampler_accept(gsmpl, token, true);
 //
 // requires: idxs.size() == draft.size() + 1
 //
 // returns at least 1 token, up to idxs.size()
 //
-std::vector<llama_token> common_sampler_sample_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);

 // assume idxs == [ 0, 1, 2, ..., draft.size() ]
-std::vector<llama_token> common_sampler_sample_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);

 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);

--- a/examples/speculative-simple/speculative-simple.cpp
+++ b/examples/speculative-simple/speculative-simple.cpp
@ -163,7 +163,9 @@ int main(int argc, char ** argv) {
        // available logits from the batch and sample the next token until we run out of logits or the sampler
        // disagrees with the draft
        //
-        const auto ids = common_sampler_sample_n(smpl, ctx_tgt, draft);
+        const auto ids = common_sampler_sample_and_accept_n(smpl, ctx_tgt, draft);
+
+        //LOG_DBG("ids: %s\n", string_from(ctx_tgt, ids).c_str());

        GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token