common : preallocate sampling token data vector (#8363)

`emplace_back` repeatedly-called is slower than preallocating the vector to the vocab size and directly inserting the data. Some rudimentary profiling with `chrono` improves the performance of this block of code from ~500us/op to ~40us/op.

Overall, this slightly improves the sampling performance which has a more substantial impact for the `examples/lookahead` implementation -- I am able to see a ~10% performance boost in lookahead inference.
This commit is contained in:
Kevin Wang 2024-07-08 03:26:53 -04:00 committed by GitHub
parent 6f0dbf6ab0
commit 470939d483
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -378,7 +378,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
if (ctx_sampling->grammar != NULL && !apply_grammar) {
GGML_ASSERT(original_logits != NULL);
// Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
*original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
*original_logits = {logits, logits + n_vocab};
}
// apply params.logit_bias map
@ -391,10 +391,10 @@ static llama_token_data_array llama_sampling_prepare_impl(
llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
}
cur.clear();
cur.resize(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
}
llama_token_data_array cur_p = { cur.data(), cur.size(), false };