Apply min_p to unsorted tokens (#5115)

commit 9241c3a2ac
parent b2b2bf988c

Changed files: llama.cpp (44)
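This commit changes llama_sample_min_p so that, when the candidate list is not sorted, min-p filtering is applied directly to the raw logits instead of first calling llama_sample_softmax (which normalizes and sorts the whole list). The new path uses FLT_MAX, hence the added #include <cfloat>. The in-code comment "min logit for p_i >= p * p_max" relies on the identity below; writing it out once (with l_i the logit of token i and p_i its softmax probability), the softmax normalizer cancels, so the probability test can be done on logits alone:

\[
p_i \ge p \cdot p_{\max}
\;\Longleftrightarrow\;
\frac{e^{l_i}}{\sum_j e^{l_j}} \ge p \cdot \frac{e^{l_{\max}}}{\sum_j e^{l_j}}
\;\Longleftrightarrow\;
l_i \ge l_{\max} + \log p .
\]

This is why the unsorted path only needs the maximum logit and logf(p), not the full probability distribution.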
--- a/llama.cpp
+++ b/llama.cpp
@@ -52,6 +52,7 @@
 #include <algorithm>
 #include <array>
 #include <cassert>
+#include <cfloat>
 #include <cinttypes>
 #include <climits>
 #include <cmath>
@@ -8246,21 +8247,56 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * can
         return;
     }
 
-    llama_sample_softmax(ctx, candidates);
-
     const int64_t t_start_sample_us = ggml_time_us();
 
-    float scale = candidates->data[0].p; // scale by max prob
-    size_t i = 1; // first token always matches
-
-    for (; i < candidates->size; ++i) {
-        if (candidates->data[i].p < p * scale && i >= min_keep) {
-            break; // prob too small
+    bool min_p_applied = false;
+
+    // if the candidates aren't sorted, try the unsorted implementation first
+    if (!candidates->sorted) {
+        std::vector<llama_token_data> filtered_tokens;
+
+        float max_logit = -FLT_MAX;
+        for (size_t i = 0; i < candidates->size; ++i) {
+            max_logit = std::max(max_logit, candidates->data[i].logit);
+        }
+        const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max
+
+        for (size_t i = 0; i < candidates->size; ++i) {
+            if (candidates->data[i].logit >= min_logit) {
+                filtered_tokens.push_back(candidates->data[i]);
+            }
+        }
+
+        // if we have enough values the operation was a success
+        if (filtered_tokens.size() >= min_keep) {
+            memcpy(candidates->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
+            candidates->size = filtered_tokens.size();
+            min_p_applied = true;
+        }
+    }
+
+    // if the candidates are sorted or the unsorted implementation failed, use this implementation
+    if (!min_p_applied) {
+        // Sort the logits in descending order
+        if (!candidates->sorted) {
+            std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+                return a.logit > b.logit;
+            });
+            candidates->sorted = true;
+        }
+
+        const float min_logit = candidates->data[0].logit + logf(p); // min logit for p_i >= p * p_max
+        size_t i = 1; // first token always matches
+
+        for (; i < candidates->size; ++i) {
+            if (candidates->data[i].logit < min_logit && i >= min_keep) {
+                break; // prob too small
+            }
         }
-    }
 
-    // Resize the output vector to keep only the matching tokens
-    candidates->size = i;
+        // Resize the output vector to keep only the matching tokens
+        candidates->size = i;
+    }
 
     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
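For intuition, here is a minimal, self-contained sketch of the unsorted path on toy data. This is not llama.cpp code: TokenData and min_p_filter_unsorted are hypothetical, simplified stand-ins for llama_token_data and llama_sample_min_p, and the fallback here simply returns the input unchanged, whereas the patch above falls back to the sorted implementation when fewer than min_keep tokens survive.

// Minimal sketch (not llama.cpp itself): min-p filtering on unsorted tokens,
// using the logit threshold min_logit = max_logit + log(p).
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstdio>
#include <vector>

struct TokenData {       // hypothetical stand-in for llama_token_data
    int   id;
    float logit;
};

// Keep every token whose softmax probability would satisfy p_i >= p * p_max.
// If fewer than min_keep tokens survive, return the input unchanged
// (the real patch falls back to the sorted implementation instead).
static std::vector<TokenData> min_p_filter_unsorted(const std::vector<TokenData> & candidates, float p, size_t min_keep) {
    // single pass to find the maximum logit (no sort, no softmax)
    float max_logit = -FLT_MAX;
    for (const auto & t : candidates) {
        max_logit = std::max(max_logit, t.logit);
    }
    const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max

    // single pass to collect the survivors
    std::vector<TokenData> filtered;
    for (const auto & t : candidates) {
        if (t.logit >= min_logit) {
            filtered.push_back(t);
        }
    }
    return filtered.size() >= min_keep ? filtered : candidates;
}

int main() {
    // deliberately unsorted toy logits
    const std::vector<TokenData> candidates = {
        {0, 1.0f}, {1, 4.0f}, {2, 2.5f}, {3, -1.0f}, {4, 3.8f},
    };

    // p = 0.1 keeps tokens within log(0.1) ~ -2.3 of the best logit (4.0),
    // i.e. tokens 1, 2 and 4 here
    const auto kept = min_p_filter_unsorted(candidates, 0.1f, /*min_keep=*/1);
    for (const auto & t : kept) {
        printf("kept token %d (logit %.2f)\n", t.id, t.logit);
    }
    return 0;
}

The point of the change is that this path needs only one pass to find the maximum logit and one pass to filter, with no softmax and no O(n log n) sort; sorting now happens only in the fallback branch.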