speculative : more robust tokenizer comparison

2024-12-28 12:24:35 +00:00 · 2024-04-04 18:25:19 -04:00 · 2024-04-04 18:25:19 -04:00 · a37696d4f1
commit a37696d4f1
parent 92591c125f
1 changed files with 22 additions and 12 deletions
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@ -76,6 +76,28 @@ int main(int argc, char ** argv) {
    params.n_threads_batch = params.n_threads_batch_draft;
    std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
    const bool vocab_type_tgt = llama_vocab_type(model_tgt);
    LOG("vocab_type tgt: %d\n", vocab_type_tgt);
    const bool vocab_type_dft = llama_vocab_type(model_dft);
    LOG("vocab_type dft: %d\n", vocab_type_dft);
    if (vocab_type_tgt != vocab_type_dft) {
        fprintf(stderr, "%s: error: draft model vocab type must match target model to use speculation but ", __func__);
        fprintf(stderr, "vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
        return 1;
    }
    if (
        llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
        llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
        llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
        llama_token_eos(model_tgt) != llama_token_eos(model_dft)
    ) {
        fprintf(stderr, "%s: error: draft model special tokens must match target model to use speculation\n", __func__);
        return 1;
    }
    {
        const int n_vocab_tgt = llama_n_vocab(model_tgt);
        const int n_vocab_dft = llama_n_vocab(model_dft);
@ -105,18 +127,6 @@ int main(int argc, char ** argv) {
    // Tokenize the prompt
    const bool add_bos_tgt = llama_should_add_bos_token(model_tgt);
    LOG("add_bos tgt: %d\n", add_bos_tgt);
    const bool add_bos_dft = llama_should_add_bos_token(model_dft);
    LOG("add_bos dft: %d\n", add_bos_dft);
    if (add_bos_tgt != add_bos_dft) {
        fprintf(stderr, "%s: error: draft model add_bos must match target model to use speculation but ", __func__);
        fprintf(stderr, "add_bos_dft = %d while add_bos_tgt = %d\n", add_bos_dft, add_bos_tgt);
        return 1;
    }
    std::vector<llama_token> inp;
    inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true);