server : fix incorrect num_tokens_predicted (#3480)

2024-12-24 10:24:35 +00:00 · 2023-10-05 09:02:55 -05:00 · 2023-10-05 09:02:55 -05:00 · e8b8d32e86
commit e8b8d32e86
parent 8f3a642ec1
1 changed file with 5 additions and 3 deletions
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -504,9 +504,11 @@ struct llama_server_context
                                           });
        }

+        bool tg = true;
        while (n_past < embd.size())
        {
            int n_eval = (int)embd.size() - n_past;
+            tg = n_eval == 1;
            if (n_eval > params.n_batch)
            {
                n_eval = params.n_batch;
@@ -633,8 +635,10 @@ struct llama_server_context

            last_n_tokens.erase(last_n_tokens.begin());
            last_n_tokens.push_back(result.tok);
+            if (tg) {
                num_tokens_predicted++;
            }
+        }

        // add it to the context
        embd.push_back(result.tok);
@@ -1124,8 +1128,6 @@ static json format_timings(llama_server_context &llama)
 {
    const auto timings = llama_get_timings(llama.ctx);

-    assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted));
-
    return json{
        {"prompt_n", timings.n_p_eval},
        {"prompt_ms", timings.t_p_eval_ms},