mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-25 19:04:35 +00:00
server : fix slot selection by lru (#10126)
Some checks failed
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-musa.Dockerfile platforms:linux/amd64 tag:full-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-musa.Dockerfile platforms:linux/amd64 tag:light-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-musa.Dockerfile platforms:linux/amd64 tag:server-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Waiting to run
Nix CI / nix-eval (macos-latest) (push) Waiting to run
Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run
Nix CI / nix-build (macos-latest) (push) Waiting to run
Nix CI / nix-build (ubuntu-latest) (push) Waiting to run
flake8 Lint / Lint (push) Waiting to run
Python check requirements.txt / check-requirements (push) Has been cancelled
Python Type-Check / pyright type-check (push) Has been cancelled
update-flake-lock / lockfile (push) Has been cancelled
Some checks failed
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-musa.Dockerfile platforms:linux/amd64 tag:full-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-musa.Dockerfile platforms:linux/amd64 tag:light-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-musa.Dockerfile platforms:linux/amd64 tag:server-musa]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Waiting to run
Nix CI / nix-eval (macos-latest) (push) Waiting to run
Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run
Nix CI / nix-build (macos-latest) (push) Waiting to run
Nix CI / nix-build (ubuntu-latest) (push) Waiting to run
flake8 Lint / Lint (push) Waiting to run
Python check requirements.txt / check-requirements (push) Has been cancelled
Python Type-Check / pyright type-check (push) Has been cancelled
update-flake-lock / lockfile (push) Has been cancelled
* server : fix slot selection by lru, migrate lcs to `size_t`
* minor debug log fix
This commit is contained in:
parent
45950415ed
commit
42cadc74bd
@ -247,6 +247,7 @@ struct server_slot {
|
|||||||
if (is_processing()) {
|
if (is_processing()) {
|
||||||
SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated);
|
SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated);
|
||||||
|
|
||||||
|
t_last_used = ggml_time_us();
|
||||||
t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
|
t_token_generation = (ggml_time_us() - t_start_generation) / 1e3;
|
||||||
state = SLOT_STATE_IDLE;
|
state = SLOT_STATE_IDLE;
|
||||||
callback_on_release(id);
|
callback_on_release(id);
|
||||||
@ -730,7 +731,7 @@ struct server_context {
|
|||||||
|
|
||||||
// find the slot that has at least n% prompt similarity
|
// find the slot that has at least n% prompt similarity
|
||||||
if (ret == nullptr && slot_prompt_similarity != 0.0f) {
|
if (ret == nullptr && slot_prompt_similarity != 0.0f) {
|
||||||
int max_lcs_len = 0;
|
int lcs_len = 0;
|
||||||
float similarity = 0;
|
float similarity = 0;
|
||||||
|
|
||||||
for (server_slot & slot : slots) {
|
for (server_slot & slot : slots) {
|
||||||
@ -745,20 +746,21 @@ struct server_context {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// length of the Longest Common Subsequence between the current slot's prompt and the input prompt
|
// length of the Longest Common Subsequence between the current slot's prompt and the input prompt
|
||||||
int lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
|
int cur_lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
|
||||||
|
|
||||||
// fraction of the common subsequence length compared to the current slot's prompt length
|
// fraction of the common subsequence length compared to the current slot's prompt length
|
||||||
similarity = static_cast<float>(lcs_len) / static_cast<int>(slot.cache_tokens.size());
|
float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());
|
||||||
|
|
||||||
// select the current slot if the criteria match
|
// select the current slot if the criteria match
|
||||||
if (lcs_len > max_lcs_len && similarity > slot_prompt_similarity) {
|
if (cur_lcs_len > lcs_len && cur_similarity > slot_prompt_similarity) {
|
||||||
max_lcs_len = lcs_len;
|
lcs_len = cur_lcs_len;
|
||||||
|
similarity = cur_similarity;
|
||||||
ret = &slot;
|
ret = &slot;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ret != nullptr) {
|
if (ret != nullptr) {
|
||||||
SLT_DBG(*ret, "selected slot by lcs similarity, max_lcs_len = %d, similarity = %f\n", max_lcs_len, similarity);
|
SLT_DBG(*ret, "selected slot by lcs similarity, lcs_len = %d, similarity = %f\n", lcs_len, similarity);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -453,20 +453,20 @@ static size_t longest_common_subsequence(const llama_tokens & a, const llama_tok
|
|||||||
}
|
}
|
||||||
|
|
||||||
// get the lengths of the input sequences
|
// get the lengths of the input sequences
|
||||||
int a_len = a.size();
|
size_t a_len = a.size();
|
||||||
int b_len = b.size();
|
size_t b_len = b.size();
|
||||||
|
|
||||||
// initialize the maximum length of the longest common subsequence (LCS)
|
// initialize the maximum length of the longest common subsequence (LCS)
|
||||||
int max_length = 0;
|
size_t max_length = 0;
|
||||||
|
|
||||||
// use two rows instead of a 2D matrix to optimize space
|
// use two rows instead of a 2D matrix to optimize space
|
||||||
std::vector<int> prev_row(b_len + 1, 0);
|
std::vector<size_t> prev_row(b_len + 1, 0);
|
||||||
std::vector<int> curr_row(b_len + 1, 0);
|
std::vector<size_t> curr_row(b_len + 1, 0);
|
||||||
|
|
||||||
// iterate through the elements of a
|
// iterate through the elements of a
|
||||||
for (int i = 1; i <= a_len; i++) {
|
for (size_t i = 1; i <= a_len; i++) {
|
||||||
// iterate through the elements of b
|
// iterate through the elements of b
|
||||||
for (int j = 1; j <= b_len; j++) {
|
for (size_t j = 1; j <= b_len; j++) {
|
||||||
// if elements at the current positions match
|
// if elements at the current positions match
|
||||||
if (a[i - 1] == b[j - 1]) {
|
if (a[i - 1] == b[j - 1]) {
|
||||||
// if it's the first element of either sequences, set LCS length to 1
|
// if it's the first element of either sequences, set LCS length to 1
|
||||||
|
Loading…
Reference in New Issue
Block a user