mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-01 06:14:35 +00:00
speculative : do not redraft previous drafts
ggml-ci
This commit is contained in:
parent
2e197a1f21
commit
be5f611000
@ -27,7 +27,7 @@ struct common_speculative * common_speculative_init(
|
|||||||
};
|
};
|
||||||
|
|
||||||
// TODO: optimize or pass from outside?
|
// TODO: optimize or pass from outside?
|
||||||
#if 1
|
#if 0
|
||||||
{
|
{
|
||||||
common_sampler_params sparams;
|
common_sampler_params sparams;
|
||||||
sparams.no_perf = false;
|
sparams.no_perf = false;
|
||||||
@ -156,13 +156,27 @@ llama_tokens common_speculative_gen_draft(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_DBG("%s: reuse_i = %d, reuse_n = %d\n", __func__, reuse_i, reuse_n);
|
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size());
|
||||||
|
|
||||||
|
llama_tokens result;
|
||||||
|
result.reserve(params.n_draft);
|
||||||
|
|
||||||
if (reuse_n == 0) {
|
if (reuse_n == 0) {
|
||||||
llama_kv_cache_clear(ctx);
|
llama_kv_cache_clear(ctx);
|
||||||
|
|
||||||
prompt.clear();
|
prompt.clear();
|
||||||
} else {
|
} else {
|
||||||
|
if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) {
|
||||||
|
for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) {
|
||||||
|
result.push_back(prompt[i]);
|
||||||
|
|
||||||
|
if (result.size() >= params.n_draft) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
|
llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
|
||||||
llama_kv_cache_seq_rm (ctx, 0, reuse_i + reuse_n, -1);
|
llama_kv_cache_seq_rm (ctx, 0, reuse_i + reuse_n, -1);
|
||||||
llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
|
llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
|
||||||
@ -201,9 +215,6 @@ llama_tokens common_speculative_gen_draft(
|
|||||||
|
|
||||||
common_sampler_reset(smpl);
|
common_sampler_reset(smpl);
|
||||||
|
|
||||||
llama_tokens result;
|
|
||||||
result.reserve(params.n_draft);
|
|
||||||
|
|
||||||
// sample n_draft tokens from the draft model
|
// sample n_draft tokens from the draft model
|
||||||
for (int i = 0; i < params.n_draft; ++i) {
|
for (int i = 0; i < params.n_draft; ++i) {
|
||||||
common_batch_clear(batch);
|
common_batch_clear(batch);
|
||||||
|
@ -134,6 +134,8 @@ int main(int argc, char ** argv) {
|
|||||||
//
|
//
|
||||||
llama_tokens draft = common_speculative_gen_draft(spec, params_spec, prompt_tgt, id_last);
|
llama_tokens draft = common_speculative_gen_draft(spec, params_spec, prompt_tgt, id_last);
|
||||||
|
|
||||||
|
//LOG_DBG("draft: %s\n", string_from(ctx_dft, draft).c_str());
|
||||||
|
|
||||||
// always have a token to evaluate from before - id_last
|
// always have a token to evaluate from before - id_last
|
||||||
common_batch_clear(batch_tgt);
|
common_batch_clear(batch_tgt);
|
||||||
common_batch_add (batch_tgt, id_last, n_past++, { 0 }, true);
|
common_batch_add (batch_tgt, id_last, n_past++, { 0 }, true);
|
||||||
|
Loading…
Reference in New Issue
Block a user