speculative : do not redraft previous drafts

ggml-ci
This commit is contained in:
Georgi Gerganov 2024-11-24 12:09:31 +02:00
parent 2e197a1f21
commit be5f611000
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
2 changed files with 18 additions and 5 deletions

View File

@ -27,7 +27,7 @@ struct common_speculative * common_speculative_init(
}; };
// TODO: optimize or pass from outside? // TODO: optimize or pass from outside?
#if 1 #if 0
{ {
common_sampler_params sparams; common_sampler_params sparams;
sparams.no_perf = false; sparams.no_perf = false;
@ -156,13 +156,27 @@ llama_tokens common_speculative_gen_draft(
} }
} }
LOG_DBG("%s: reuse_i = %d, reuse_n = %d\n", __func__, reuse_i, reuse_n); LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size());
llama_tokens result;
result.reserve(params.n_draft);
if (reuse_n == 0) { if (reuse_n == 0) {
llama_kv_cache_clear(ctx); llama_kv_cache_clear(ctx);
prompt.clear(); prompt.clear();
} else { } else {
if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) {
for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) {
result.push_back(prompt[i]);
if (result.size() >= params.n_draft) {
break;
}
}
return result;
}
llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i); llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
llama_kv_cache_seq_rm (ctx, 0, reuse_i + reuse_n, -1); llama_kv_cache_seq_rm (ctx, 0, reuse_i + reuse_n, -1);
llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i); llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
@ -201,9 +215,6 @@ llama_tokens common_speculative_gen_draft(
common_sampler_reset(smpl); common_sampler_reset(smpl);
llama_tokens result;
result.reserve(params.n_draft);
// sample n_draft tokens from the draft model // sample n_draft tokens from the draft model
for (int i = 0; i < params.n_draft; ++i) { for (int i = 0; i < params.n_draft; ++i) {
common_batch_clear(batch); common_batch_clear(batch);

View File

@ -134,6 +134,8 @@ int main(int argc, char ** argv) {
// //
llama_tokens draft = common_speculative_gen_draft(spec, params_spec, prompt_tgt, id_last); llama_tokens draft = common_speculative_gen_draft(spec, params_spec, prompt_tgt, id_last);
//LOG_DBG("draft: %s\n", string_from(ctx_dft, draft).c_str());
// always have a token to evaluate from before - id_last // always have a token to evaluate from before - id_last
common_batch_clear(batch_tgt); common_batch_clear(batch_tgt);
common_batch_add (batch_tgt, id_last, n_past++, { 0 }, true); common_batch_add (batch_tgt, id_last, n_past++, { 0 }, true);